/*
 * Copyright 2011-2017 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/* CUDA kernel entry points */

#include "kernel_config.h"

#include "kernel/kernel_compat_cuda.h"

#include "kernel/filter/filter_kernel.h"
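/* Denoising filter kernels.
 *
 * Each kernel below is a thin wrapper that maps one CUDA thread to one pixel
 * of the given rectangle and forwards to the matching kernel_filter_*()
 * implementation from kernel/filter/filter_kernel.h. */

/* Prefilter the shadow pass: compute unfiltered shadow estimates and their
 * variance from the render buffer. */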
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_divide_shadow(int sample,
                                 TilesInfo *tiles,
                                 float *unfilteredA, float *unfilteredB,
                                 float *sampleVariance,
                                 float *sampleVarianceV,
                                 float *bufferVariance,
                                 int4 prefilter_rect,
                                 int buffer_pass_stride,
                                 int buffer_denoising_offset,
                                 bool use_split_variance)
{
    int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
    int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
    if(x < prefilter_rect.z && y < prefilter_rect.w) {
        kernel_filter_divide_shadow(sample, tiles,
                                    x, y,
                                    unfilteredA, unfilteredB,
                                    sampleVariance, sampleVarianceV, bufferVariance,
                                    prefilter_rect, buffer_pass_stride,
                                    buffer_denoising_offset, use_split_variance);
    }
}
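/* Fetch a denoising feature pass (mean and variance) from the render buffer. */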
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_get_feature(int sample,
                               TilesInfo *tiles,
                               int m_offset, int v_offset,
                               float *mean, float *variance,
                               int4 prefilter_rect,
                               int buffer_pass_stride,
                               int buffer_denoising_offset,
                               bool use_split_variance)
{
    int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
    int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
    if(x < prefilter_rect.z && y < prefilter_rect.w) {
        kernel_filter_get_feature(sample, tiles,
                                  m_offset, v_offset,
                                  x, y, mean, variance,
                                  prefilter_rect, buffer_pass_stride,
                                  buffer_denoising_offset, use_split_variance);
    }
}
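/* Detect outlier (firefly) pixels in the noisy color pass. */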
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_detect_outliers(float *image, float *variance,
                                   float *depth, float *output,
                                   int4 prefilter_rect, int pass_stride)
{
    int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
    int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
    if(x < prefilter_rect.z && y < prefilter_rect.w) {
        kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride);
    }
}
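/* Combine the two half buffers into a mean and a variance estimate. */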
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r)
{
    int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
    int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
    if(x < prefilter_rect.z && y < prefilter_rect.w) {
        kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
    }
}
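/* Construct the per-pixel feature-space transform and its rank from the
 * local feature vectors, with pca_threshold controlling dimensionality. */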
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
                                       float *transform, int *rank,
                                       int4 filter_area, int4 rect,
                                       int radius, float pca_threshold,
                                       int pass_stride)
{
    int x = blockDim.x*blockIdx.x + threadIdx.x;
    int y = blockDim.y*blockIdx.y + threadIdx.y;
    if(x < filter_area.z && y < filter_area.w) {
        int *l_rank = rank + y*filter_area.z + x;
        float *l_transform = transform + y*filter_area.z + x;
        kernel_filter_construct_transform(buffer,
                                          x + filter_area.x, y + filter_area.y,
                                          rect, pass_stride,
                                          l_transform, l_rank,
                                          radius, pca_threshold,
                                          filter_area.z*filter_area.w,
                                          threadIdx.y*blockDim.x + threadIdx.x);
    }
}
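/* Non-local means: per-pixel squared difference between the image and its
 * copy shifted by (dx, dy), normalized by the variance. */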
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_calc_difference(int dx, int dy,
                                       const float *ccl_restrict weightImage,
                                       const float *ccl_restrict varianceImage,
                                       float *differenceImage,
                                       int4 rect, int w,
                                       int channel_offset,
                                       float a, float k_2) {
    int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
    int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
    if(x < rect.z && y < rect.w) {
        kernel_filter_nlm_calc_difference(x, y, dx, dy, weightImage, varianceImage, differenceImage, rect, w, channel_offset, a, k_2);
    }
}
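/* Non-local means: blur the per-pixel differences over the patch window. */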
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_blur(const float *ccl_restrict differenceImage, float *outImage, int4 rect, int w, int f) {
    int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
    int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
    if(x < rect.z && y < rect.w) {
        kernel_filter_nlm_blur(x, y, differenceImage, outImage, rect, w, f);
    }
}
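/* Non-local means: convert blurred patch differences into filter weights. */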
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict differenceImage, float *outImage, int4 rect, int w, int f) {
    int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
    int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
    if(x < rect.z && y < rect.w) {
        kernel_filter_nlm_calc_weight(x, y, differenceImage, outImage, rect, w, f);
    }
}
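/* Non-local means: accumulate the weighted contribution of the (dx, dy)
 * shifted neighbor into the output and weight accumulation images. */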
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_update_output(int dx, int dy,
                                     const float *ccl_restrict differenceImage,
                                     const float *ccl_restrict image,
                                     float *outImage, float *accumImage,
                                     int4 rect, int w, int f) {
    int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
    int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
    if(x < rect.z && y < rect.w) {
        kernel_filter_nlm_update_output(x, y, dx, dy, differenceImage, image, outImage, accumImage, rect, w, f);
    }
}
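/* Non-local means: divide the accumulated output by the accumulated weights. */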
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_normalize(float *outImage, const float *ccl_restrict accumImage, int4 rect, int w) {
    int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
    int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
    if(x < rect.z && y < rect.w) {
        kernel_filter_nlm_normalize(x, y, outImage, accumImage, rect, w);
    }
}
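/* Accumulate the weighted least-squares system (XtWX and XtWY) that the
 * final per-pixel regression solves. */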
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_construct_gramian(int dx, int dy,
                                         const float *ccl_restrict differenceImage,
                                         const float *ccl_restrict buffer,
                                         float *color_pass,
                                         float *variance_pass,
                                         float const* __restrict__ transform,
                                         int *rank,
                                         float *XtWX, float3 *XtWY,
                                         int4 rect, int4 filter_rect,
                                         int w, int h, int f, int pass_stride) {
    int x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x);
    int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y);
    if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
        kernel_filter_nlm_construct_gramian(x, y, dx, dy,
                                            differenceImage, buffer,
                                            color_pass, variance_pass,
                                            transform, rank,
                                            XtWX, XtWY,
                                            rect, filter_rect,
                                            w, h, f, pass_stride,
                                            threadIdx.y*blockDim.x + threadIdx.x);
    }
}
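/* Solve the per-pixel regression and write the denoised result into the
 * render buffer. */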
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_finalize(int w, int h,
                            float *buffer, int *rank,
                            float *XtWX, float3 *XtWY,
                            int4 filter_area, int4 buffer_params,
                            int sample) {
    int x = blockDim.x*blockIdx.x + threadIdx.x;
    int y = blockDim.y*blockIdx.y + threadIdx.y;
    if(x < filter_area.z && y < filter_area.w) {
        int storage_ofs = y*filter_area.z+x;
        rank += storage_ofs;
        XtWX += storage_ofs;
        XtWY += storage_ofs;
        kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
    }
}