Cycles: Improve denoising speed on GPUs with small tile sizes
[blender.git] / intern / cycles / kernel / filter / filter_nlm_cpu.h
1 /*
2  * Copyright 2011-2017 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 CCL_NAMESPACE_BEGIN
18
19 ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
20                                                          const float *ccl_restrict weight_image,
21                                                          const float *ccl_restrict variance_image,
22                                                          float *difference_image,
23                                                          int4 rect,
24                                                          int stride,
25                                                          int channel_offset,
26                                                          float a,
27                                                          float k_2)
28 {
29         for(int y = rect.y; y < rect.w; y++) {
30                 for(int x = rect.x; x < rect.z; x++) {
31                         float diff = 0.0f;
32                         int numChannels = channel_offset? 3 : 1;
33                         for(int c = 0; c < numChannels; c++) {
34                                 float cdiff = weight_image[c*channel_offset + y*stride + x] - weight_image[c*channel_offset + (y+dy)*stride + (x+dx)];
35                                 float pvar = variance_image[c*channel_offset + y*stride + x];
36                                 float qvar = variance_image[c*channel_offset + (y+dy)*stride + (x+dx)];
37                                 diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
38                         }
39                         if(numChannels > 1) {
40                                 diff *= 1.0f/numChannels;
41                         }
42                         difference_image[y*stride + x] = diff;
43                 }
44         }
45 }
46
47 ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict difference_image,
48                                               float *out_image,
49                                               int4 rect,
50                                               int stride,
51                                               int f)
52 {
53         int aligned_lowx = rect.x / 4;
54         int aligned_highx = (rect.z + 3) / 4;
55         for(int y = rect.y; y < rect.w; y++) {
56                 const int low = max(rect.y, y-f);
57                 const int high = min(rect.w, y+f+1);
58                 for(int x = rect.x; x < rect.z; x++) {
59                         out_image[y*stride + x] = 0.0f;
60                 }
61                 for(int y1 = low; y1 < high; y1++) {
62                         float4* out_image4 = (float4*)(out_image + y*stride);
63                         float4* difference_image4 = (float4*)(difference_image + y1*stride);
64                         for(int x = aligned_lowx; x < aligned_highx; x++) {
65                                 out_image4[x] += difference_image4[x];
66                         }
67                 }
68                 for(int x = rect.x; x < rect.z; x++) {
69                         out_image[y*stride + x] *= 1.0f/(high - low);
70                 }
71         }
72 }
73
74 ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
75                                                      float *out_image,
76                                                      int4 rect,
77                                                      int stride,
78                                                      int f)
79 {
80         for(int y = rect.y; y < rect.w; y++) {
81                 for(int x = rect.x; x < rect.z; x++) {
82                         out_image[y*stride + x] = 0.0f;
83                 }
84         }
85         for(int dx = -f; dx <= f; dx++) {
86                 int pos_dx = max(0, dx);
87                 int neg_dx = min(0, dx);
88                 for(int y = rect.y; y < rect.w; y++) {
89                         for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) {
90                                 out_image[y*stride + x] += difference_image[y*stride + x+dx];
91                         }
92                 }
93         }
94         for(int y = rect.y; y < rect.w; y++) {
95                 for(int x = rect.x; x < rect.z; x++) {
96                         const int low = max(rect.x, x-f);
97                         const int high = min(rect.z, x+f+1);
98                         out_image[y*stride + x] = fast_expf(-max(out_image[y*stride + x] * (1.0f/(high - low)), 0.0f));
99                 }
100         }
101 }
102
103 ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
104                                                        const float *ccl_restrict difference_image,
105                                                        const float *ccl_restrict image,
106                                                        float *out_image,
107                                                        float *accum_image,
108                                                        int4 rect,
109                                                        int stride,
110                                                        int f)
111 {
112         for(int y = rect.y; y < rect.w; y++) {
113                 for(int x = rect.x; x < rect.z; x++) {
114                         const int low = max(rect.x, x-f);
115                         const int high = min(rect.z, x+f+1);
116                         float sum = 0.0f;
117                         for(int x1 = low; x1 < high; x1++) {
118                                 sum += difference_image[y*stride + x1];
119                         }
120                         float weight = sum * (1.0f/(high - low));
121                         accum_image[y*stride + x] += weight;
122                         out_image[y*stride + x] += weight*image[(y+dy)*stride + (x+dx)];
123                 }
124         }
125 }
126
127 ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
128                                                            const float *ccl_restrict difference_image,
129                                                            const float *ccl_restrict buffer,
130                                                            float *transform,
131                                                            int *rank,
132                                                            float *XtWX,
133                                                            float3 *XtWY,
134                                                            int4 rect,
135                                                            int4 filter_window,
136                                                            int stride, int f,
137                                                            int pass_stride)
138 {
139         int4 clip_area = rect_clip(rect, filter_window);
140         /* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */
141         for(int y = clip_area.y; y < clip_area.w; y++) {
142                 for(int x = clip_area.x; x < clip_area.z; x++) {
143                         const int low = max(rect.x, x-f);
144                         const int high = min(rect.z, x+f+1);
145                         float sum = 0.0f;
146                         for(int x1 = low; x1 < high; x1++) {
147                                 sum += difference_image[y*stride + x1];
148                         }
149                         float weight = sum * (1.0f/(high - low));
150
151                         int storage_ofs = coord_to_local_index(filter_window, x, y);
152                         float  *l_transform = transform + storage_ofs*TRANSFORM_SIZE;
153                         float  *l_XtWX = XtWX + storage_ofs*XTWX_SIZE;
154                         float3 *l_XtWY = XtWY + storage_ofs*XTWY_SIZE;
155                         int    *l_rank = rank + storage_ofs;
156
157                         kernel_filter_construct_gramian(x, y, 1,
158                                                         dx, dy,
159                                                         stride,
160                                                         pass_stride,
161                                                         buffer,
162                                                         l_transform, l_rank,
163                                                         weight, l_XtWX, l_XtWY, 0);
164                 }
165         }
166 }
167
168 ccl_device_inline void kernel_filter_nlm_normalize(float *out_image,
169                                                    const float *ccl_restrict accum_image,
170                                                    int4 rect,
171                                                    int w)
172 {
173         for(int y = rect.y; y < rect.w; y++) {
174                 for(int x = rect.x; x < rect.z; x++) {
175                         out_image[y*w+x] /= accum_image[y*w+x];
176                 }
177         }
178 }
179
180 CCL_NAMESPACE_END