Code refactor: use float4 instead of intrinsics for CPU denoise filtering.
Author: Brecht Van Lommel <brechtvanlommel@gmail.com>
Tue, 18 Jul 2017 23:54:56 +0000 (01:54 +0200)
Committer: Brecht Van Lommel <brechtvanlommel@gmail.com>
Mon, 7 Aug 2017 12:01:24 +0000 (14:01 +0200)
Differential Revision: https://developer.blender.org/D2764

intern/cycles/kernel/filter/filter_features_sse.h
intern/cycles/kernel/filter/filter_nlm_cpu.h
intern/cycles/kernel/filter/filter_transform_sse.h
intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
intern/cycles/util/util_math_float4.h
intern/cycles/util/util_math_matrix.h
intern/cycles/util/util_simd.h

index 27e220923a0bd1ae66bb8b90d23fb11370f178f1..3ddd871226647e7b01cfbd2eb81ebf4e60453f8f 100644 (file)
@@ -16,7 +16,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-#define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride)
+#define ccl_get_feature_sse(pass) load_float4(buffer + (pass)*pass_stride)
 
 /* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time.
  * pixel_buffer always points to the first of the 4 current pixel in the first pass.
@@ -24,25 +24,25 @@ CCL_NAMESPACE_BEGIN
 
 #define FOR_PIXEL_WINDOW_SSE     pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
                                  for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
-                                     __m128 y4 = _mm_set1_ps(pixel.y); \
+                                     float4 y4 = make_float4(pixel.y); \
                                      for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
-                                         __m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \
-                                         __m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x));
+                                         float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \
+                                         int4 active_pixels = x4 < make_float4(high.x);
 
 #define END_FOR_PIXEL_WINDOW_SSE     } \
                                      pixel_buffer += buffer_w - (pixel.x - low.x); \
                                  }
 
-ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y,
-                                               __m128 active_pixels,
+ccl_device_inline void filter_get_features_sse(float4 x, float4 y,
+                                               int4 active_pixels,
                                                const float *ccl_restrict buffer,
-                                               __m128 *features,
-                                               const __m128 *ccl_restrict mean,
+                                               float4 *features,
+                                               const float4 *ccl_restrict mean,
                                                int pass_stride)
 {
        features[0] = x;
        features[1] = y;
-       features[2] = _mm_fabs_ps(ccl_get_feature_sse(0));
+       features[2] = fabs(ccl_get_feature_sse(0));
        features[3] = ccl_get_feature_sse(1);
        features[4] = ccl_get_feature_sse(2);
        features[5] = ccl_get_feature_sse(3);
@@ -52,53 +52,41 @@ ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y,
        features[9] = ccl_get_feature_sse(7);
        if(mean) {
                for(int i = 0; i < DENOISE_FEATURES; i++)
-                       features[i] = _mm_sub_ps(features[i], mean[i]);
+                       features[i] = features[i] - mean[i];
        }
        for(int i = 0; i < DENOISE_FEATURES; i++)
-               features[i] = _mm_mask_ps(features[i], active_pixels);
+               features[i] = mask(active_pixels, features[i]);
 }
 
-ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y,
-                                                     __m128 active_pixels,
+ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y,
+                                                     int4 active_pixels,
                                                      const float *ccl_restrict buffer,
-                                                     __m128 *scales,
-                                                     const __m128 *ccl_restrict mean,
+                                                     float4 *scales,
+                                                     const float4 *ccl_restrict mean,
                                                      int pass_stride)
 {
-       scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels);
-       scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels);
-
-       scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(_mm_fabs_ps(ccl_get_feature_sse(0)), mean[2])), active_pixels);
-
-       __m128 diff, scale;
-       diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]);
-       scale = _mm_mul_ps(diff, diff);
-       diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]);
-       scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
-       diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]);
-       scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
-       scales[3] = _mm_mask_ps(scale, active_pixels);
-
-       scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels);
-
-       diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]);
-       scale = _mm_mul_ps(diff, diff);
-       diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]);
-       scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
-       diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]);
-       scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
-       scales[5] = _mm_mask_ps(scale, active_pixels);
+       scales[0] = fabs(x - mean[0]);
+       scales[1] = fabs(y - mean[1]);
+       scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]);
+       scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) +
+                   sqr(ccl_get_feature_sse(2) - mean[4]) +
+                   sqr(ccl_get_feature_sse(3) - mean[5]);
+       scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]);
+       scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) +
+                   sqr(ccl_get_feature_sse(6) - mean[8]) +
+                   sqr(ccl_get_feature_sse(7) - mean[9]);
+       for(int i = 0; i < 6; i++)
+               scales[i] = mask(active_pixels, scales[i]);
 }
 
-ccl_device_inline void filter_calculate_scale_sse(__m128 *scale)
+ccl_device_inline void filter_calculate_scale_sse(float4 *scale)
 {
-       scale[0] = _mm_div_ps(_mm_set1_ps(1.0f), _mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f)));
-       scale[1] = _mm_div_ps(_mm_set1_ps(1.0f), _mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f)));
-       scale[2] = _mm_div_ps(_mm_set1_ps(1.0f), _mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f)));
-       scale[6] = _mm_div_ps(_mm_set1_ps(1.0f), _mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f)));
-
-       scale[7] = scale[8] = scale[9] = _mm_div_ps(_mm_set1_ps(1.0f), _mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f)));
-       scale[3] = scale[4] = scale[5] = _mm_div_ps(_mm_set1_ps(1.0f), _mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f)));
+       scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f)));
+       scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f)));
+       scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f)));
+       scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f)));
+       scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f)));
+       scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f)));
 }
 
 
index 3e752bce68f01afd5bae510ad0a0b707aaee517c..5e989331bc2cacdca36f6f3db50a3f1b005ea897 100644 (file)
@@ -50,10 +50,8 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen
                                               int w,
                                               int f)
 {
-#ifdef __KERNEL_SSE3__
-       int aligned_lowx = (rect.x & ~(3));
-       int aligned_highx = ((rect.z + 3) & ~(3));
-#endif
+       int aligned_lowx = rect.x / 4;
+       int aligned_highx = (rect.z + 3) / 4;
        for(int y = rect.y; y < rect.w; y++) {
                const int low = max(rect.y, y-f);
                const int high = min(rect.w, y+f+1);
@@ -61,15 +59,11 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen
                        out_image[y*w+x] = 0.0f;
                }
                for(int y1 = low; y1 < high; y1++) {
-#ifdef __KERNEL_SSE3__
-                       for(int x = aligned_lowx; x < aligned_highx; x+=4) {
-                               _mm_store_ps(out_image + y*w+x, _mm_add_ps(_mm_load_ps(out_image + y*w+x), _mm_load_ps(difference_image + y1*w+x)));
+                       float4* out_image4 = (float4*)(out_image + y*w);
+                       float4* difference_image4 = (float4*)(difference_image + y1*w);
+                       for(int x = aligned_lowx; x < aligned_highx; x++) {
+                               out_image4[x] += difference_image4[x];
                        }
-#else
-                       for(int x = rect.x; x < rect.z; x++) {
-                               out_image[y*w+x] += difference_image[y1*w+x];
-                       }
-#endif
                }
                for(int x = rect.x; x < rect.z; x++) {
                        out_image[y*w+x] *= 1.0f/(high - low);
index 30dc2969b114f40fd3c41a360e9a5ad0ab4efaa9..9e65f61664b6544de50abec160084e6fa049f020 100644 (file)
@@ -24,7 +24,7 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
 {
        int buffer_w = align_up(rect.z - rect.x, 4);
 
-       __m128 features[DENOISE_FEATURES];
+       float4 features[DENOISE_FEATURES];
        const float *ccl_restrict pixel_buffer;
        int2 pixel;
 
@@ -34,19 +34,19 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
                              min(rect.w, y + radius + 1));
        int num_pixels = (high.y - low.y) * (high.x - low.x);
 
-       __m128 feature_means[DENOISE_FEATURES];
+       float4 feature_means[DENOISE_FEATURES];
        math_vector_zero_sse(feature_means, DENOISE_FEATURES);
        FOR_PIXEL_WINDOW_SSE {
                filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride);
                math_vector_add_sse(feature_means, DENOISE_FEATURES, features);
        } END_FOR_PIXEL_WINDOW_SSE
 
-       __m128 pixel_scale = _mm_set1_ps(1.0f / num_pixels);
+       float4 pixel_scale = make_float4(1.0f / num_pixels);
        for(int i = 0; i < DENOISE_FEATURES; i++) {
-               feature_means[i] = _mm_mul_ps(_mm_hsum_ps(feature_means[i]), pixel_scale);
+               feature_means[i] = reduce_add(feature_means[i]) * pixel_scale;
        }
 
-       __m128 feature_scale[DENOISE_FEATURES];
+       float4 feature_scale[DENOISE_FEATURES];
        math_vector_zero_sse(feature_scale, DENOISE_FEATURES);
        FOR_PIXEL_WINDOW_SSE {
                filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
@@ -55,12 +55,12 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
 
        filter_calculate_scale_sse(feature_scale);
 
-       __m128 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
+       float4 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
        math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES);
        FOR_PIXEL_WINDOW_SSE {
                filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
                math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale);
-               math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f));
+               math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, make_float4(1.0f));
        } END_FOR_PIXEL_WINDOW_SSE
 
        float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
@@ -98,7 +98,7 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
 
        /* Bake the feature scaling into the transformation matrix. */
        for(int i = 0; i < DENOISE_FEATURES; i++) {
-               math_vector_scale(transform + i*DENOISE_FEATURES, _mm_cvtss_f32(feature_scale[i]), *rank);
+               math_vector_scale(transform + i*DENOISE_FEATURES, feature_scale[i][0], *rank);
        }
 }
 
index f7c9935f1d078c518ee7d0c92c452509b7f2bd6f..a13fb5cd4fb18d6d889fb953859b72817df6e8d5 100644 (file)
@@ -25,6 +25,7 @@
 #else
 /* SSE optimization disabled for now on 32 bit, see bug #36316 */
 #  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE__
 #    define __KERNEL_SSE2__
 #  endif
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
index 070b95a35053942cffc04a2ca4b3e77386ac549b..6b690adf0f50a6c2f2f40cba81cfb9d6fc012687 100644 (file)
@@ -25,6 +25,7 @@
 #else
 /* SSE optimization disabled for now on 32 bit, see bug #36316 */
 #  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE__
 #    define __KERNEL_SSE2__
 #    define __KERNEL_SSE3__
 #    define __KERNEL_SSSE3__
index 1a7b2040da1f8ccf67180d49c9b1bc830f268234..254025be4e26fff4adc9fb9bf140106963383b3a 100644 (file)
@@ -25,6 +25,7 @@
 #else
 /* SSE optimization disabled for now on 32 bit, see bug #36316 */
 #  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE__
 #    define __KERNEL_SSE2__
 #    define __KERNEL_SSE3__
 #    define __KERNEL_SSSE3__
index 007b3fc5082c3e379f67f33105d194125264894d..adb9a76a4349f46feeeaef7ae188c653ab29fe79 100644 (file)
@@ -52,7 +52,6 @@ ccl_device_inline float4 sqrt(const float4& a);
 ccl_device_inline float4 sqr(const float4& a);
 ccl_device_inline float4 cross(const float4& a, const float4& b);
 ccl_device_inline bool is_zero(const float4& a);
-ccl_device_inline float reduce_add(const float4& a);
 ccl_device_inline float average(const float4& a);
 ccl_device_inline float len(const float4& a);
 ccl_device_inline float4 normalize(const float4& a);
@@ -85,6 +84,7 @@ ccl_device_inline float4 select(const int4& mask,
                                 const float4& b);
 ccl_device_inline float4 reduce_min(const float4& a);
 ccl_device_inline float4 reduce_max(const float4& a);
+ccl_device_inline float4 reduce_add(const float4& a);
 #endif  /* !__KERNEL_GPU__ */
 
 /*******************************************************************************
@@ -275,24 +275,24 @@ ccl_device_inline bool is_zero(const float4& a)
 #endif
 }
 
-ccl_device_inline float reduce_add(const float4& a)
+ccl_device_inline float4 reduce_add(const float4& a)
 {
 #ifdef __KERNEL_SSE__
 #  ifdef __KERNEL_SSE3__
     float4 h(_mm_hadd_ps(a.m128, a.m128));
-    return  _mm_cvtss_f32(_mm_hadd_ps(h.m128, h.m128));
+    return float4( _mm_hadd_ps(h.m128, h.m128));
 #  else
        float4 h(shuffle<1,0,3,2>(a) + a);
-       return  _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h);
+       return  shuffle<2,3,0,1>(h) + h;
 #  endif
 #else
-       return ((a.x + a.y) + (a.z + a.w));
+       return make_float4(((a.x + a.y) + (a.z + a.w)));
 #endif
 }
 
 ccl_device_inline float average(const float4& a)
 {
-       return reduce_add(a) * 0.25f;
+       return reduce_add(a)[0] * 0.25f;
 }
 
 ccl_device_inline float len(const float4& a)
index c7511f8306e6d3a59fde2abbca934d2128e26a42..7269d391956780acc212fdb54882f20333045089 100644 (file)
@@ -339,59 +339,59 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float
 }
 
 #ifdef __KERNEL_SSE3__
-ccl_device_inline void math_vector_zero_sse(__m128 *A, int n)
+ccl_device_inline void math_vector_zero_sse(float4 *A, int n)
 {
        for(int i = 0; i < n; i++) {
-               A[i] = _mm_setzero_ps();
+               A[i] = make_float4(0.0f);
        }
 }
 
-ccl_device_inline void math_matrix_zero_sse(__m128 *A, int n)
+ccl_device_inline void math_matrix_zero_sse(float4 *A, int n)
 {
        for(int row = 0; row < n; row++) {
                for(int col = 0; col <= row; col++) {
-                       MAT(A, n, row, col) = _mm_setzero_ps();
+                       MAT(A, n, row, col) = make_float4(0.0f);
                }
        }
 }
 
 /* Add Gramian matrix of v to A.
  * The Gramian matrix of v is v^T*v, so element (i,j) is v[i]*v[j]. */
-ccl_device_inline void math_matrix_add_gramian_sse(__m128 *A, int n, const __m128 *ccl_restrict v, __m128 weight)
+ccl_device_inline void math_matrix_add_gramian_sse(float4 *A, int n, const float4 *ccl_restrict v, float4 weight)
 {
        for(int row = 0; row < n; row++) {
                for(int col = 0; col <= row; col++) {
-                       MAT(A, n, row, col) = _mm_add_ps(MAT(A, n, row, col), _mm_mul_ps(_mm_mul_ps(v[row], v[col]), weight));
+                       MAT(A, n, row, col) = MAT(A, n, row, col) + v[row] * v[col] * weight;
                }
        }
 }
 
-ccl_device_inline void math_vector_add_sse(__m128 *V, int n, const __m128 *ccl_restrict a)
+ccl_device_inline void math_vector_add_sse(float4 *V, int n, const float4 *ccl_restrict a)
 {
        for(int i = 0; i < n; i++) {
-               V[i] = _mm_add_ps(V[i], a[i]);
+               V[i] += a[i];
        }
 }
 
-ccl_device_inline void math_vector_mul_sse(__m128 *V, int n, const __m128 *ccl_restrict a)
+ccl_device_inline void math_vector_mul_sse(float4 *V, int n, const float4 *ccl_restrict a)
 {
        for(int i = 0; i < n; i++) {
-               V[i] = _mm_mul_ps(V[i], a[i]);
+               V[i] *= a[i];
        }
 }
 
-ccl_device_inline void math_vector_max_sse(__m128 *a, const __m128 *ccl_restrict b, int n)
+ccl_device_inline void math_vector_max_sse(float4 *a, const float4 *ccl_restrict b, int n)
 {
        for(int i = 0; i < n; i++) {
-               a[i] = _mm_max_ps(a[i], b[i]);
+               a[i] = max(a[i], b[i]);
        }
 }
 
-ccl_device_inline void math_matrix_hsum(float *A, int n, const __m128 *ccl_restrict B)
+ccl_device_inline void math_matrix_hsum(float *A, int n, const float4 *ccl_restrict B)
 {
        for(int row = 0; row < n; row++) {
                for(int col = 0; col <= row; col++) {
-                       MAT(A, n, row, col) = _mm_hsum_ss(MAT(B, n, row, col));
+                       MAT(A, n, row, col) = reduce_add(MAT(B, n, row, col))[0];
                }
        }
 }
index 66dd80420ae9c81a3ddc9b626a8bb58932732418..a2b3247b20769884adf9abd9cfb1ec5d129c993d 100644 (file)
@@ -417,39 +417,6 @@ __forceinline __m128 _mm_round_ps( __m128 value, const int flags )
 
 #endif /* !(defined(__SSE4_1__) || defined(__SSE4_2__)) */
 
-#undef _mm_fabs_ps
-#define _mm_fabs_ps(x) _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))
-
-/* Return a __m128 with every element set to the largest element of v. */
-ccl_device_inline __m128 _mm_hmax_ps(__m128 v)
-{
-  /* v[0, 1, 2, 3] => [0, 1, 0, 1] and [2, 3, 2, 3] => v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] */
-  v = _mm_max_ps(_mm_movehl_ps(v, v), _mm_movelh_ps(v, v));
-  /* v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] => [4 times max(1, 3)] and [4 times max(0, 2)] => v[4 times max(0, 1, 2, 3)] */
-  v = _mm_max_ps(_mm_movehdup_ps(v), _mm_moveldup_ps(v));
-  return v;
-}
-
-/* Return the sum of the four elements of x. */
-ccl_device_inline float _mm_hsum_ss(__m128 x)
-{
-    __m128 a = _mm_movehdup_ps(x);
-    __m128 b = _mm_add_ps(x, a);
-    return _mm_cvtss_f32(_mm_add_ss(_mm_movehl_ps(a, b), b));
-}
-
-/* Return a __m128 with every element set to the sum of the four elements of x. */
-ccl_device_inline __m128 _mm_hsum_ps(__m128 x)
-{
-    x = _mm_hadd_ps(x, x);
-    x = _mm_hadd_ps(x, x);
-    return x;
-}
-
-/* Replace elements of x with zero where mask isn't set. */
-#undef _mm_mask_ps
-#define _mm_mask_ps(x, mask) _mm_blendv_ps(_mm_setzero_ps(), x, mask)
-
 #else  /* __KERNEL_SSE2__ */
 
 /* This section is for utility functions which operates on non-register data