X-Git-Url: https://git.blender.org/gitweb/gitweb.cgi/blender.git/blobdiff_plain/43b374e8c5430488a302298b1026faa1c3a231e9..fa3d50af95fde76ef08590d2f86444f2f9fdca95:/intern/cycles/kernel/filter/filter_nlm_gpu.h diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h index b5ba7cf51a5..4ca49ea6733 100644 --- a/intern/cycles/kernel/filter/filter_nlm_gpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h @@ -16,132 +16,187 @@ CCL_NAMESPACE_BEGIN +/* Determines pixel coordinates and offset for the current thread. + * Returns whether the thread should do any work. + * + * All coordinates are relative to the denoising buffer! + * + * Window is the rect that should be processed. + * co is filled with (x, y, dx, dy). + */ +ccl_device_inline bool get_nlm_coords_window(int w, int h, int r, int stride, + int4 *rect, int4 *co, int *ofs, + int4 window) +{ + /* Determine the pixel offset that this thread should apply. */ + int s = 2*r+1; + int si = ccl_global_id(1); + int sx = si % s; + int sy = si / s; + if(sy >= s) { + return false; + } + co->z = sx-r; + co->w = sy-r; + + /* Pixels still need to lie inside the denoising buffer after applying the offset, + * so determine the area for which this is the case. */ + *rect = make_int4(max(0, -co->z), max(0, -co->w), + w - max(0, co->z), h - max(0, co->w)); + + /* Find the intersection of the area that we want to process (window) and the area + * that can be processed (rect) to get the final area for this offset. */ + int4 clip_area = rect_clip(window, *rect); + + /* If the radius is larger than one of the sides of the window, + * there will be shifts for which there is no usable pixel at all. */ + if(!rect_is_valid(clip_area)) { + return false; + } + + /* Map the linear thread index to pixels inside the clip area. */ + int x, y; + if(!local_index_to_coord(clip_area, ccl_global_id(0), &x, &y)) { + return false; + } + co->x = x; + co->y = y; + + *ofs = (sy*s + sx) * stride; + + return true; +} + +ccl_device_inline bool get_nlm_coords(int w, int h, int r, int stride, + int4 *rect, int4 *co, int *ofs) +{ + return get_nlm_coords_window(w, h, r, stride, rect, co, ofs, make_int4(0, 0, w, h)); +} + ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y, int dx, int dy, - ccl_global float ccl_restrict_ptr weightImage, - ccl_global float ccl_restrict_ptr varianceImage, - ccl_global float *differenceImage, - int4 rect, int w, + const ccl_global float *ccl_restrict weight_image, + const ccl_global float *ccl_restrict variance_image, + ccl_global float *difference_image, + int4 rect, int stride, int channel_offset, float a, float k_2) { float diff = 0.0f; int numChannels = channel_offset? 3 : 1; for(int c = 0; c < numChannels; c++) { - float cdiff = weightImage[c*channel_offset + y*w+x] - weightImage[c*channel_offset + (y+dy)*w+(x+dx)]; - float pvar = varianceImage[c*channel_offset + y*w+x]; - float qvar = varianceImage[c*channel_offset + (y+dy)*w+(x+dx)]; + float cdiff = weight_image[c*channel_offset + y*stride + x] - weight_image[c*channel_offset + (y+dy)*stride + (x+dx)]; + float pvar = variance_image[c*channel_offset + y*stride + x]; + float qvar = variance_image[c*channel_offset + (y+dy)*stride + (x+dx)]; diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar)); } if(numChannels > 1) { diff *= 1.0f/numChannels; } - differenceImage[y*w+x] = diff; + difference_image[y*stride + x] = diff; } ccl_device_inline void kernel_filter_nlm_blur(int x, int y, - ccl_global float ccl_restrict_ptr differenceImage, - ccl_global float *outImage, - int4 rect, int w, int f) + const ccl_global float *ccl_restrict difference_image, + ccl_global float *out_image, + int4 rect, int stride, int f) { float sum = 0.0f; const int low = max(rect.y, y-f); const int high = min(rect.w, y+f+1); for(int y1 = low; y1 < high; y1++) { - sum += differenceImage[y1*w+x]; + sum += difference_image[y1*stride + x]; } sum *= 1.0f/(high-low); - outImage[y*w+x] = sum; + out_image[y*stride + x] = sum; } ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y, - ccl_global float ccl_restrict_ptr differenceImage, - ccl_global float *outImage, - int4 rect, int w, int f) + const ccl_global float *ccl_restrict difference_image, + ccl_global float *out_image, + int4 rect, int stride, int f) { float sum = 0.0f; const int low = max(rect.x, x-f); const int high = min(rect.z, x+f+1); for(int x1 = low; x1 < high; x1++) { - sum += differenceImage[y*w+x1]; + sum += difference_image[y*stride + x1]; } sum *= 1.0f/(high-low); - outImage[y*w+x] = expf(-max(sum, 0.0f)); + out_image[y*stride + x] = fast_expf(-max(sum, 0.0f)); } ccl_device_inline void kernel_filter_nlm_update_output(int x, int y, int dx, int dy, - ccl_global float ccl_restrict_ptr differenceImage, - ccl_global float ccl_restrict_ptr image, - ccl_global float *outImage, - ccl_global float *accumImage, - int4 rect, int w, int f) + const ccl_global float *ccl_restrict difference_image, + const ccl_global float *ccl_restrict image, + ccl_global float *out_image, + ccl_global float *accum_image, + int4 rect, int stride, int f) { float sum = 0.0f; const int low = max(rect.x, x-f); const int high = min(rect.z, x+f+1); for(int x1 = low; x1 < high; x1++) { - sum += differenceImage[y*w+x1]; + sum += difference_image[y*stride + x1]; } sum *= 1.0f/(high-low); - if(outImage) { - accumImage[y*w+x] += sum; - outImage[y*w+x] += sum*image[(y+dy)*w+(x+dx)]; + if(out_image) { + atomic_add_and_fetch_float(accum_image + y*stride + x, sum); + atomic_add_and_fetch_float(out_image + y*stride + x, sum*image[(y+dy)*stride + (x+dx)]); } else { - accumImage[y*w+x] = sum; + accum_image[y*stride + x] = sum; } } -ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy, +ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y, int dx, int dy, - ccl_global float ccl_restrict_ptr differenceImage, - ccl_global float ccl_restrict_ptr buffer, - ccl_global float *color_pass, - ccl_global float *variance_pass, - ccl_global float ccl_restrict_ptr transform, + const ccl_global float *ccl_restrict difference_image, + const ccl_global float *ccl_restrict buffer, + const ccl_global float *ccl_restrict transform, ccl_global int *rank, ccl_global float *XtWX, ccl_global float3 *XtWY, int4 rect, - int4 filter_rect, - int w, int h, int f, + int4 filter_window, + int stride, int f, int pass_stride, int localIdx) { - int y = fy + filter_rect.y; - int x = fx + filter_rect.x; const int low = max(rect.x, x-f); const int high = min(rect.z, x+f+1); float sum = 0.0f; for(int x1 = low; x1 < high; x1++) { - sum += differenceImage[y*w+x1]; + sum += difference_image[y*stride + x1]; } float weight = sum * (1.0f/(high - low)); - int storage_ofs = fy*filter_rect.z + fx; + /* Reconstruction data is only stored for pixels inside the filter window, + * so compute the pixels's index in there. */ + int storage_ofs = coord_to_local_index(filter_window, x, y); transform += storage_ofs; rank += storage_ofs; XtWX += storage_ofs; XtWY += storage_ofs; kernel_filter_construct_gramian(x, y, - filter_rect.z*filter_rect.w, - dx, dy, w, h, + rect_size(filter_window), + dx, dy, + stride, pass_stride, buffer, - color_pass, variance_pass, transform, rank, weight, XtWX, XtWY, localIdx); } ccl_device_inline void kernel_filter_nlm_normalize(int x, int y, - ccl_global float *outImage, - ccl_global float ccl_restrict_ptr accumImage, - int4 rect, int w) + ccl_global float *out_image, + const ccl_global float *ccl_restrict accum_image, + int stride) { - outImage[y*w+x] /= accumImage[y*w+x]; + out_image[y*stride + x] /= accum_image[y*stride + x]; } CCL_NAMESPACE_END