Cycles: Improve denoising speed on GPUs with small tile sizes
[blender.git] / intern / cycles / kernel / filter / filter_nlm_cpu.h
index 5e989331bc2cacdca36f6f3db50a3f1b005ea897..e2da0fd872bce418731e78ab9face67bb3b57b28 100644 (file)
@@ -21,7 +21,7 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
                                                          const float *ccl_restrict variance_image,
                                                          float *difference_image,
                                                          int4 rect,
-                                                         int w,
+                                                         int stride,
                                                          int channel_offset,
                                                          float a,
                                                          float k_2)
@@ -31,15 +31,15 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
                        float diff = 0.0f;
                        int numChannels = channel_offset? 3 : 1;
                        for(int c = 0; c < numChannels; c++) {
-                               float cdiff = weight_image[c*channel_offset + y*w+x] - weight_image[c*channel_offset + (y+dy)*w+(x+dx)];
-                               float pvar = variance_image[c*channel_offset + y*w+x];
-                               float qvar = variance_image[c*channel_offset + (y+dy)*w+(x+dx)];
+                               float cdiff = weight_image[c*channel_offset + y*stride + x] - weight_image[c*channel_offset + (y+dy)*stride + (x+dx)];
+                               float pvar = variance_image[c*channel_offset + y*stride + x];
+                               float qvar = variance_image[c*channel_offset + (y+dy)*stride + (x+dx)];
                                diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
                        }
                        if(numChannels > 1) {
                                diff *= 1.0f/numChannels;
                        }
-                       difference_image[y*w+x] = diff;
+                       difference_image[y*stride + x] = diff;
                }
        }
 }
@@ -47,7 +47,7 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
 ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict difference_image,
                                               float *out_image,
                                               int4 rect,
-                                              int w,
+                                              int stride,
                                               int f)
 {
        int aligned_lowx = rect.x / 4;
@@ -56,17 +56,17 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen
                const int low = max(rect.y, y-f);
                const int high = min(rect.w, y+f+1);
                for(int x = rect.x; x < rect.z; x++) {
-                       out_image[y*w+x] = 0.0f;
+                       out_image[y*stride + x] = 0.0f;
                }
                for(int y1 = low; y1 < high; y1++) {
-                       float4* out_image4 = (float4*)(out_image + y*w);
-                       float4* difference_image4 = (float4*)(difference_image + y1*w);
+                       float4* out_image4 = (float4*)(out_image + y*stride);
+                       float4* difference_image4 = (float4*)(difference_image + y1*stride);
                        for(int x = aligned_lowx; x < aligned_highx; x++) {
                                out_image4[x] += difference_image4[x];
                        }
                }
                for(int x = rect.x; x < rect.z; x++) {
-                       out_image[y*w+x] *= 1.0f/(high - low);
+                       out_image[y*stride + x] *= 1.0f/(high - low);
                }
        }
 }
@@ -74,12 +74,12 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen
 ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
                                                      float *out_image,
                                                      int4 rect,
-                                                     int w,
+                                                     int stride,
                                                      int f)
 {
        for(int y = rect.y; y < rect.w; y++) {
                for(int x = rect.x; x < rect.z; x++) {
-                       out_image[y*w+x] = 0.0f;
+                       out_image[y*stride + x] = 0.0f;
                }
        }
        for(int dx = -f; dx <= f; dx++) {
@@ -87,7 +87,7 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict d
                int neg_dx = min(0, dx);
                for(int y = rect.y; y < rect.w; y++) {
                        for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) {
-                               out_image[y*w+x] += difference_image[y*w+dx+x];
+                               out_image[y*stride + x] += difference_image[y*stride + x+dx];
                        }
                }
        }
@@ -95,7 +95,7 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict d
                for(int x = rect.x; x < rect.z; x++) {
                        const int low = max(rect.x, x-f);
                        const int high = min(rect.z, x+f+1);
-                       out_image[y*w+x] = fast_expf(-max(out_image[y*w+x] * (1.0f/(high - low)), 0.0f));
+                       out_image[y*stride + x] = fast_expf(-max(out_image[y*stride + x] * (1.0f/(high - low)), 0.0f));
                }
        }
 }
@@ -106,7 +106,7 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
                                                        float *out_image,
                                                        float *accum_image,
                                                        int4 rect,
-                                                       int w,
+                                                       int stride,
                                                        int f)
 {
        for(int y = rect.y; y < rect.w; y++) {
@@ -115,11 +115,11 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
                        const int high = min(rect.z, x+f+1);
                        float sum = 0.0f;
                        for(int x1 = low; x1 < high; x1++) {
-                               sum += difference_image[y*w+x1];
+                               sum += difference_image[y*stride + x1];
                        }
                        float weight = sum * (1.0f/(high - low));
-                       accum_image[y*w+x] += weight;
-                       out_image[y*w+x] += weight*image[(y+dy)*w+(x+dx)];
+                       accum_image[y*stride + x] += weight;
+                       out_image[y*stride + x] += weight*image[(y+dy)*stride + (x+dx)];
                }
        }
 }
@@ -132,31 +132,31 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
                                                            float *XtWX,
                                                            float3 *XtWY,
                                                            int4 rect,
-                                                           int4 filter_rect,
-                                                           int w, int h, int f,
+                                                           int4 filter_window,
+                                                           int stride, int f,
                                                            int pass_stride)
 {
+       int4 clip_area = rect_clip(rect, filter_window);
        /* fx and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */
-       for(int fy = max(0, rect.y-filter_rect.y); fy < min(filter_rect.w, rect.w-filter_rect.y); fy++) {
-               int y = fy + filter_rect.y;
-               for(int fx = max(0, rect.x-filter_rect.x); fx < min(filter_rect.z, rect.z-filter_rect.x); fx++) {
-                       int x = fx + filter_rect.x;
+       for(int y = clip_area.y; y < clip_area.w; y++) {
+               for(int x = clip_area.x; x < clip_area.z; x++) {
                        const int low = max(rect.x, x-f);
                        const int high = min(rect.z, x+f+1);
                        float sum = 0.0f;
                        for(int x1 = low; x1 < high; x1++) {
-                               sum += difference_image[y*w+x1];
+                               sum += difference_image[y*stride + x1];
                        }
                        float weight = sum * (1.0f/(high - low));
 
-                       int storage_ofs = fy*filter_rect.z + fx;
+                       int storage_ofs = coord_to_local_index(filter_window, x, y);
                        float  *l_transform = transform + storage_ofs*TRANSFORM_SIZE;
                        float  *l_XtWX = XtWX + storage_ofs*XTWX_SIZE;
                        float3 *l_XtWY = XtWY + storage_ofs*XTWY_SIZE;
                        int    *l_rank = rank + storage_ofs;
 
                        kernel_filter_construct_gramian(x, y, 1,
-                                                       dx, dy, w, h,
+                                                       dx, dy,
+                                                       stride,
                                                        pass_stride,
                                                        buffer,
                                                        l_transform, l_rank,