+/* Determines pixel coordinates and offset for the current thread.
+ * Returns whether the thread should do any work.
+ *
+ * All coordinates are relative to the denoising buffer!
+ *
+ * Window is the rect that should be processed.
+ * co is filled with (x, y, dx, dy).
+ */
+ccl_device_inline bool get_nlm_coords_window(int w, int h, int r, int stride,
+ int4 *rect, int4 *co, int *ofs,
+ int4 window)
+{
+ /* Determine the pixel offset that this thread should apply. */
+ int s = 2*r+1;
+ int si = ccl_global_id(1);
+ int sx = si % s;
+ int sy = si / s;
+ if(sy >= s) {
+ return false;
+ }
+ co->z = sx-r;
+ co->w = sy-r;
+
+ /* Pixels still need to lie inside the denoising buffer after applying the offset,
+ * so determine the area for which this is the case. */
+ *rect = make_int4(max(0, -co->z), max(0, -co->w),
+ w - max(0, co->z), h - max(0, co->w));
+
+ /* Find the intersection of the area that we want to process (window) and the area
+ * that can be processed (rect) to get the final area for this offset. */
+ int4 clip_area = rect_clip(window, *rect);
+
+ /* If the radius is larger than one of the sides of the window,
+ * there will be shifts for which there is no usable pixel at all. */
+ if(!rect_is_valid(clip_area)) {
+ return false;
+ }
+
+ /* Map the linear thread index to pixels inside the clip area. */
+ int x, y;
+ if(!local_index_to_coord(clip_area, ccl_global_id(0), &x, &y)) {
+ return false;
+ }
+ co->x = x;
+ co->y = y;
+
+ *ofs = (sy*s + sx) * stride;
+
+ return true;
+}
+
+ccl_device_inline bool get_nlm_coords(int w, int h, int r, int stride,
+ int4 *rect, int4 *co, int *ofs)
+{
+ return get_nlm_coords_window(w, h, r, stride, rect, co, ofs, make_int4(0, 0, w, h));
+}
+