X-Git-Url: https://git.blender.org/gitweb/gitweb.cgi/blender.git/blobdiff_plain/df7b9fa2eeb5908de4e1b3c2c6f7cf30329f1e3d..fa3d50af95fde76ef08590d2f86444f2f9fdca95:/intern/cycles/kernel/kernels/cuda/filter.cu diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu index c8172355a7f..035f0484488 100644 --- a/intern/cycles/kernel/kernels/cuda/filter.cu +++ b/intern/cycles/kernel/kernels/cuda/filter.cu @@ -134,95 +134,140 @@ kernel_cuda_filter_construct_transform(float const* __restrict__ buffer, extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_calc_difference(int dx, int dy, - const float *ccl_restrict weight_image, +kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image, const float *ccl_restrict variance_image, float *difference_image, - int4 rect, int w, + int w, + int h, + int stride, + int shift_stride, + int r, int channel_offset, - float a, float k_2) + float a, + float k_2) { - int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; - int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; - if(x < rect.z && y < rect.w) { - kernel_filter_nlm_calc_difference(x, y, dx, dy, weight_image, variance_image, difference_image, rect, w, channel_offset, a, k_2); + int4 co, rect; + int ofs; + if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w, + weight_image, + variance_image, + difference_image + ofs, + rect, stride, + channel_offset, a, k_2); } } extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f) +kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image, + float *out_image, + int w, + int h, + int stride, + int shift_stride, + int r, + int f) { - int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; - int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; - if(x < rect.z && y < rect.w) { - kernel_filter_nlm_blur(x, y, difference_image, out_image, rect, w, f); + int4 co, rect; + int ofs; + if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + kernel_filter_nlm_blur(co.x, co.y, + difference_image + ofs, + out_image + ofs, + rect, stride, f); } } extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f) +kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image, + float *out_image, + int w, + int h, + int stride, + int shift_stride, + int r, + int f) { - int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; - int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; - if(x < rect.z && y < rect.w) { - kernel_filter_nlm_calc_weight(x, y, difference_image, out_image, rect, w, f); + int4 co, rect; + int ofs; + if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + kernel_filter_nlm_calc_weight(co.x, co.y, + difference_image + ofs, + out_image + ofs, + rect, stride, f); } } extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_update_output(int dx, int dy, - const float *ccl_restrict difference_image, +kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image, const float *ccl_restrict image, - float *out_image, float *accum_image, - int4 rect, int w, + float *out_image, + float *accum_image, + int w, + int h, + int stride, + int shift_stride, + int r, int f) { - int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; - int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; - if(x < rect.z && y < rect.w) { - kernel_filter_nlm_update_output(x, y, dx, dy, difference_image, image, out_image, accum_image, rect, w, f); + int4 co, rect; + int ofs; + if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w, + difference_image + ofs, + image, + out_image, + accum_image, + rect, stride, f); } } extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_normalize(float *out_image, const float *ccl_restrict accum_image, int4 rect, int w) +kernel_cuda_filter_nlm_normalize(float *out_image, + const float *ccl_restrict accum_image, + int w, + int h, + int stride) { - int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; - int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; - if(x < rect.z && y < rect.w) { - kernel_filter_nlm_normalize(x, y, out_image, accum_image, rect, w); + int x = blockDim.x*blockIdx.x + threadIdx.x; + int y = blockDim.y*blockIdx.y + threadIdx.y; + if(x < w && y < h) { + kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride); } } extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_construct_gramian(int dx, int dy, - const float *ccl_restrict difference_image, +kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_image, const float *ccl_restrict buffer, float const* __restrict__ transform, int *rank, float *XtWX, float3 *XtWY, - int4 rect, - int4 filter_rect, - int w, int h, int f, + int4 filter_window, + int w, + int h, + int stride, + int shift_stride, + int r, + int f, int pass_stride) { - int x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x); - int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y); - if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) { - kernel_filter_nlm_construct_gramian(x, y, - dx, dy, - difference_image, + int4 co, rect; + int ofs; + if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) { + kernel_filter_nlm_construct_gramian(co.x, co.y, + co.z, co.w, + difference_image + ofs, buffer, transform, rank, XtWX, XtWY, - rect, filter_rect, - w, h, f, + rect, filter_window, + stride, f, pass_stride, threadIdx.y*blockDim.x + threadIdx.x); } @@ -230,10 +275,12 @@ kernel_cuda_filter_nlm_construct_gramian(int dx, int dy, extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_finalize(int w, int h, - float *buffer, int *rank, - float *XtWX, float3 *XtWY, - int4 filter_area, int4 buffer_params, +kernel_cuda_filter_finalize(float *buffer, + int *rank, + float *XtWX, + float3 *XtWY, + int4 filter_area, + int4 buffer_params, int sample) { int x = blockDim.x*blockIdx.x + threadIdx.x; @@ -243,7 +290,10 @@ kernel_cuda_filter_finalize(int w, int h, rank += storage_ofs; XtWX += storage_ofs; XtWY += storage_ofs; - kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample); + kernel_filter_finalize(x, y, buffer, rank, + filter_area.z*filter_area.w, + XtWX, XtWY, + buffer_params, sample); } }