/*
 * Copyright 2011-2017 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* CUDA kernel entry points */

#ifdef __CUDA_ARCH__

#include "kernel_config.h"

#include "kernel/kernel_compat_cuda.h"

#include "kernel/filter/filter_kernel.h"

/* kernels */
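
/* Shadow prefiltering: estimate the shadow feature and its variance from the two half-sample buffers. */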
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_divide_shadow(int sample,
                                 TilesInfo *tiles,
                                 float *unfilteredA,
                                 float *unfilteredB,
                                 float *sampleVariance,
                                 float *sampleVarianceV,
                                 float *bufferVariance,
                                 int4 prefilter_rect,
                                 int buffer_pass_stride,
                                 int buffer_denoising_offset)
{
	int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
	int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
	if(x < prefilter_rect.z && y < prefilter_rect.w) {
		kernel_filter_divide_shadow(sample,
		                            tiles,
		                            x, y,
		                            unfilteredA,
		                            unfilteredB,
		                            sampleVariance,
		                            sampleVarianceV,
		                            bufferVariance,
		                            prefilter_rect,
		                            buffer_pass_stride,
		                            buffer_denoising_offset);
	}
}
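
/* Fetch the mean and variance of a denoising feature pass from the render buffer. */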
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_get_feature(int sample,
                               TilesInfo *tiles,
                               int m_offset,
                               int v_offset,
                               float *mean,
                               float *variance,
                               int4 prefilter_rect,
                               int buffer_pass_stride,
                               int buffer_denoising_offset)
{
	int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
	int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
	if(x < prefilter_rect.z && y < prefilter_rect.w) {
		kernel_filter_get_feature(sample,
		                          tiles,
		                          m_offset, v_offset,
		                          x, y,
		                          mean, variance,
		                          prefilter_rect,
		                          buffer_pass_stride,
		                          buffer_denoising_offset);
	}
}
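
/* Detect outlier (firefly) pixels so they do not distort the filter weights. */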
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_detect_outliers(float *image,
                                   float *variance,
                                   float *depth,
                                   float *output,
                                   int4 prefilter_rect,
                                   int pass_stride)
{
	int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
	int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
	if(x < prefilter_rect.z && y < prefilter_rect.w) {
		kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride);
	}
}
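
/* Combine two half buffers into their mean and variance. */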
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r)
{
	int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
	int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
	if(x < prefilter_rect.z && y < prefilter_rect.w) {
		kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
	}
}
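
/* Build the per-pixel feature-space transform (PCA of the local feature neighborhood) used by the regression fit. */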
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
                                       float *transform, int *rank,
                                       int4 filter_area, int4 rect,
                                       int radius, float pca_threshold,
                                       int pass_stride)
{
	int x = blockDim.x*blockIdx.x + threadIdx.x;
	int y = blockDim.y*blockIdx.y + threadIdx.y;
	if(x < filter_area.z && y < filter_area.w) {
		int *l_rank = rank + y*filter_area.z + x;
		float *l_transform = transform + y*filter_area.z + x;
		kernel_filter_construct_transform(buffer,
		                                  x + filter_area.x, y + filter_area.y,
		                                  rect, pass_stride,
		                                  l_transform, l_rank,
		                                  radius, pca_threshold,
		                                  filter_area.z*filter_area.w,
		                                  threadIdx.y*blockDim.x + threadIdx.x);
	}
}
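
/* Non-local means, step 1: squared difference between each pixel and its (dx, dy)-shifted neighbor. */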
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_calc_difference(int dx, int dy,
                                       const float *ccl_restrict weight_image,
                                       const float *ccl_restrict variance_image,
                                       float *difference_image,
                                       int4 rect, int w,
                                       int channel_offset,
                                       float a, float k_2)
{
	int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
	int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
	if(x < rect.z && y < rect.w) {
		kernel_filter_nlm_calc_difference(x, y, dx, dy, weight_image, variance_image, difference_image, rect, w, channel_offset, a, k_2);
	}
}
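
/* Non-local means, step 2: blur the difference image over the patch window of radius f. */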
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f)
{
	int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
	int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
	if(x < rect.z && y < rect.w) {
		kernel_filter_nlm_blur(x, y, difference_image, out_image, rect, w, f);
	}
}
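
/* Non-local means, step 3: convert blurred patch differences into filter weights. */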
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f)
{
	int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
	int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
	if(x < rect.z && y < rect.w) {
		kernel_filter_nlm_calc_weight(x, y, difference_image, out_image, rect, w, f);
	}
}
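
/* Non-local means, step 4: accumulate weighted neighbor contributions and the total weight per pixel. */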
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_update_output(int dx, int dy,
                                     const float *ccl_restrict difference_image,
                                     const float *ccl_restrict image,
                                     float *out_image, float *accum_image,
                                     int4 rect, int w,
                                     int f)
{
	int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
	int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
	if(x < rect.z && y < rect.w) {
		kernel_filter_nlm_update_output(x, y, dx, dy, difference_image, image, out_image, accum_image, rect, w, f);
	}
}
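
/* Non-local means, final step: normalize the accumulated output by the accumulated weights. */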
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_normalize(float *out_image, const float *ccl_restrict accum_image, int4 rect, int w)
{
	int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
	int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
	if(x < rect.z && y < rect.w) {
		kernel_filter_nlm_normalize(x, y, out_image, accum_image, rect, w);
	}
}
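
/* Accumulate the weighted least-squares normal equations (XtWX and XtWY) for each pixel's regression fit. */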
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_construct_gramian(int dx, int dy,
                                         const float *ccl_restrict difference_image,
                                         const float *ccl_restrict buffer,
                                         float const* __restrict__ transform,
                                         int *rank,
                                         float *XtWX,
                                         float3 *XtWY,
                                         int4 rect,
                                         int4 filter_rect,
                                         int w, int h, int f,
                                         int pass_stride)
{
	int x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x);
	int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y);
	if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
		kernel_filter_nlm_construct_gramian(x, y,
		                                    dx, dy,
		                                    difference_image,
		                                    buffer,
		                                    transform, rank,
		                                    XtWX, XtWY,
		                                    rect, filter_rect,
		                                    w, h, f,
		                                    pass_stride,
		                                    threadIdx.y*blockDim.x + threadIdx.x);
	}
}
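
/* Solve the per-pixel least-squares system and write the denoised result to the render buffer. */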
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_finalize(int w, int h,
                            float *buffer, int *rank,
                            float *XtWX, float3 *XtWY,
                            int4 filter_area, int4 buffer_params,
                            int sample)
{
	int x = blockDim.x*blockIdx.x + threadIdx.x;
	int y = blockDim.y*blockIdx.y + threadIdx.y;
	if(x < filter_area.z && y < filter_area.w) {
		int storage_ofs = y*filter_area.z+x;
		rank += storage_ofs;
		XtWX += storage_ofs;
		XtWY += storage_ofs;
		kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
	}
}

#endif  /* __CUDA_ARCH__ */