Optimize linear<->sRGB conversion for SSE2 processors
author: Sergey Sharybin <sergey.vfx@gmail.com>
Thu, 5 May 2016 17:25:08 +0000 (19:25 +0200)
committer: Sergey Sharybin <sergey.vfx@gmail.com>
Thu, 5 May 2016 17:46:06 +0000 (19:46 +0200)
Using SSE2 intrinsics when available for this kind of conversions.

It's not totally accurate, but accurate enough for the purposes where
we're using direct colorspace conversion, bypassing OCIO.

Partially based on code from Cycles, partially based on other online
articles:

  https://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent

Makes projection painting on hi-res float textures smoother.

This commit also enables global SSE2 in Blender. It shouldn't
bring any regressions on supported hardware (we have required SSE2
since 2.64), but we should keep an eye on it because compilers might
have some bugs with that (unlikely, but possible).

CMakeLists.txt
source/blender/blenlib/intern/math_base_inline.c
source/blender/blenlib/intern/math_color_inline.c
source/blender/compositor/CMakeLists.txt

index 438cb1cdd9f1f8f40e0737dc19459e6fa89e3f8e..6fc4ca2729d701e8b95e206cec0614836e8ca408 100644 (file)
@@ -2357,17 +2357,17 @@ endif()
 
 # See TEST_SSE_SUPPORT() for how this is defined.
 
-if(WITH_RAYOPTIMIZATION)
-       if(SUPPORT_SSE_BUILD)
-               set(PLATFORM_CFLAGS " ${COMPILER_SSE_FLAG} ${PLATFORM_CFLAGS}")
-               add_definitions(-D__SSE__ -D__MMX__)
-       endif()
-       if(SUPPORT_SSE2_BUILD)
-               set(PLATFORM_CFLAGS " ${COMPILER_SSE2_FLAG} ${PLATFORM_CFLAGS}")
-               add_definitions(-D__SSE2__)
-               if(NOT SUPPORT_SSE_BUILD) # dont double up
-                       add_definitions(-D__MMX__)
-               endif()
+# Do it globally, SSE2 is required for quite some time now.
+# Doing it now allows to use SSE/SSE2 in inline headers.
+if(SUPPORT_SSE_BUILD)
+       set(PLATFORM_CFLAGS " ${COMPILER_SSE_FLAG} ${PLATFORM_CFLAGS}")
+       add_definitions(-D__SSE__ -D__MMX__)
+endif()
+if(SUPPORT_SSE2_BUILD)
+       set(PLATFORM_CFLAGS " ${COMPILER_SSE2_FLAG} ${PLATFORM_CFLAGS}")
+       add_definitions(-D__SSE2__)
+       if(NOT SUPPORT_SSE_BUILD) # dont double up
+               add_definitions(-D__MMX__)
        endif()
 endif()
 
index f70c12bd26e0fe3f5b695d7f7505c6f88bf8244a..7ab2b7848121c5e6bad4b403b4a9efddfd40fa7d 100644 (file)
 #include <stdlib.h>
 #include <string.h>
 
+#ifdef __SSE2__
+#  include <emmintrin.h>
+#endif
+
 #include "BLI_math_base.h"
 
 /* copied from BLI_utildefines.h */
@@ -311,4 +315,98 @@ MINLINE int signum_i(float a)
        else          return  0;
 }
 
+/* Internal helpers for SSE2 implementation.
+ *
+ * NOTE: Are to be called ONLY from inside `#ifdef __SSE2__` !!!
+ */
+
+#ifdef __SSE2__
+
+/* Calculate initial guess for arg^exp based on float representation
+ * This method gives a constant bias, which can be easily compensated by
+ * multiplicating with bias_coeff.
+ * Gives better results for exponents near 1 (e. g. 4/5).
+ * exp = exponent, encoded as uint32_t
+ * e2coeff = 2^(127/exponent - 127) * bias_coeff^(1/exponent), encoded as
+ * uint32_t
+ *
+ * We hope that exp and e2coeff gets properly inlined
+ */
+MALWAYS_INLINE __m128 _bli_math_fastpow(const int exp,
+                                        const int e2coeff,
+                                        const __m128 arg)
+{
+       __m128 ret;
+       ret = _mm_mul_ps(arg, _mm_castsi128_ps(_mm_set1_epi32(e2coeff)));
+       ret = _mm_cvtepi32_ps(_mm_castps_si128(ret));
+       ret = _mm_mul_ps(ret, _mm_castsi128_ps(_mm_set1_epi32(exp)));
+       ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret));
+       return ret;
+}
+
+/* Improve x ^ 1.0f/5.0f solution with Newton-Raphson method */
+MALWAYS_INLINE __m128 _bli_math_improve_5throot_solution(
+    const __m128 old_result,
+    const __m128 x)
+{
+       __m128 approx2 = _mm_mul_ps(old_result, old_result);
+       __m128 approx4 = _mm_mul_ps(approx2, approx2);
+       __m128 t = _mm_div_ps(x, approx4);
+       __m128 summ = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(4.0f), old_result), t); /* fma */
+       return _mm_mul_ps(summ, _mm_set1_ps(1.0f/5.0f));
+}
+
+/* Calculate powf(x, 2.4). Working domain: 1e-10 < x < 1e+10 */
+MALWAYS_INLINE __m128 _bli_math_fastpow24(const __m128 arg)
+{
+       /* max, avg and |avg| errors were calculated in gcc without FMA instructions
+        * The final precision should be better than powf in glibc */
+
+       /* Calculate x^4/5, coefficient 0.994 was constructed manually to minimize
+        * avg error.
+        */
+       /* 0x3F4CCCCD = 4/5 */
+       /* 0x4F55A7FB = 2^(127/(4/5) - 127) * 0.994^(1/(4/5)) */
+       /* error max = 0.17     avg = 0.0018    |avg| = 0.05 */
+       __m128 x = _bli_math_fastpow(0x3F4CCCCD, 0x4F55A7FB, arg);
+       __m128 arg2 = _mm_mul_ps(arg, arg);
+       __m128 arg4 = _mm_mul_ps(arg2, arg2);
+       /* error max = 0.018        avg = 0.0031    |avg| = 0.0031  */
+       x = _bli_math_improve_5throot_solution(x, arg4);
+       /* error max = 0.00021    avg = 1.6e-05    |avg| = 1.6e-05 */
+       x = _bli_math_improve_5throot_solution(x, arg4);
+       /* error max = 6.1e-07    avg = 5.2e-08    |avg| = 1.1e-07 */
+       x = _bli_math_improve_5throot_solution(x, arg4);
+       return _mm_mul_ps(x, _mm_mul_ps(x, x));
+}
+
+/* Calculate powf(x, 1.0f / 2.4) */
+MALWAYS_INLINE __m128 _bli_math_fastpow512(const __m128 arg)
+{
+       /* 5/12 is too small, so compute the 4th root of 20/12 instead.
+        * 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
+        * weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
+        */
+       __m128 xf = _bli_math_fastpow(0x3f2aaaab, 0x5eb504f3, arg);
+       __m128 xover = _mm_mul_ps(arg, xf);
+       __m128 xfm1 = _mm_rsqrt_ps(xf);
+       __m128 x2 = _mm_mul_ps(arg, arg);
+       __m128 xunder = _mm_mul_ps(x2, xfm1);
+       /* sqrt2 * over + 2 * sqrt2 * under */
+       __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f),
+                                _mm_add_ps(xover, xunder));
+       xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
+       xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
+       return xavg;
+}
+
+MALWAYS_INLINE __m128 _bli_math_blend_sse(const __m128 mask,
+                                          const __m128 a,
+                                          const __m128 b)
+{
+       return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
+}
+
+#endif  /* __SSE2__ */
+
 #endif /* __MATH_BASE_INLINE_C__ */
index 45466226e72f087993fb451fe71d38f55f68045a..c268481a5d9ac203d3be9c14635ac3c1a578b835 100644 (file)
@@ -28,6 +28,7 @@
  */
 
 
+#include "BLI_math_base.h"
 #include "BLI_math_color.h"
 #include "BLI_utildefines.h"
 
 
 /******************************** Color Space ********************************/
 
+#ifdef __SSE2__
+
+MALWAYS_INLINE __m128 srgb_to_linearrgb_v4_simd(const __m128 c)
+{
+       __m128 cmp = _mm_cmplt_ps(c, _mm_set1_ps(0.04045f));
+       __m128 lt = _mm_max_ps(_mm_mul_ps(c, _mm_set1_ps(1.0f/12.92f)),
+                              _mm_set1_ps(0.0f));
+       __m128 gtebase = _mm_mul_ps(_mm_add_ps(c, _mm_set1_ps(0.055f)),
+                                   _mm_set1_ps(1.0f/1.055f)); /* fma */
+       __m128 gte = _bli_math_fastpow24(gtebase);
+       return _bli_math_blend_sse(cmp, lt, gte);
+}
+
+MALWAYS_INLINE __m128 linearrgb_to_srgb_v4_simd(const __m128 c)
+{
+       __m128 cmp = _mm_cmplt_ps(c, _mm_set1_ps(0.0031308f));
+       __m128 lt = _mm_max_ps(_mm_mul_ps(c, _mm_set1_ps(12.92f)),
+                              _mm_set1_ps(0.0f));
+       __m128 gte = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.055f),
+                                          _bli_math_fastpow512(c)),
+                               _mm_set1_ps(-0.055f));
+       return _bli_math_blend_sse(cmp, lt, gte);
+}
+
+MINLINE void srgb_to_linearrgb_v3_v3(float linear[3], const float srgb[3])
+{
+       float r[4] = {srgb[0], srgb[1], srgb[2], 0.0f};
+       __m128 *rv = (__m128 *)&r;
+       *rv = srgb_to_linearrgb_v4_simd(*rv);
+       linear[0] = r[0];
+       linear[1] = r[1];
+       linear[2] = r[2];
+}
+
+MINLINE void linearrgb_to_srgb_v3_v3(float srgb[3], const float linear[3])
+{
+       float r[4] = {linear[0], linear[1], linear[2], 0.0f};
+       __m128 *rv = (__m128 *)&r;
+       *rv = linearrgb_to_srgb_v4_simd(*rv);
+       srgb[0] = r[0];
+       srgb[1] = r[1];
+       srgb[2] = r[2];
+}
+
+#else  /* __SSE2__ */
+
 MINLINE void srgb_to_linearrgb_v3_v3(float linear[3], const float srgb[3])
 {
        linear[0] = srgb_to_linearrgb(srgb[0]);
@@ -51,6 +98,7 @@ MINLINE void linearrgb_to_srgb_v3_v3(float srgb[3], const float linear[3])
        srgb[1] = linearrgb_to_srgb(linear[1]);
        srgb[2] = linearrgb_to_srgb(linear[2]);
 }
+#endif  /* __SSE2__ */
 
 MINLINE void srgb_to_linearrgb_v4(float linear[4], const float srgb[4])
 {
@@ -98,10 +146,14 @@ MINLINE void srgb_to_linearrgb_predivide_v4(float linear[4], const float srgb[4]
                inv_alpha = 1.0f / alpha;
        }
 
-       linear[0] = srgb_to_linearrgb(srgb[0] * inv_alpha) * alpha;
-       linear[1] = srgb_to_linearrgb(srgb[1] * inv_alpha) * alpha;
-       linear[2] = srgb_to_linearrgb(srgb[2] * inv_alpha) * alpha;
+       linear[0] = srgb[0] * inv_alpha;
+       linear[1] = srgb[1] * inv_alpha;
+       linear[2] = srgb[2] * inv_alpha;
        linear[3] = srgb[3];
+       srgb_to_linearrgb_v3_v3(linear, linear);
+       linear[0] *= alpha;
+       linear[1] *= alpha;
+       linear[2] *= alpha;
 }
 
 MINLINE void linearrgb_to_srgb_predivide_v4(float srgb[4], const float linear[4])
@@ -117,10 +169,14 @@ MINLINE void linearrgb_to_srgb_predivide_v4(float srgb[4], const float linear[4]
                inv_alpha = 1.0f / alpha;
        }
 
-       srgb[0] = linearrgb_to_srgb(linear[0] * inv_alpha) * alpha;
-       srgb[1] = linearrgb_to_srgb(linear[1] * inv_alpha) * alpha;
-       srgb[2] = linearrgb_to_srgb(linear[2] * inv_alpha) * alpha;
+       srgb[0] = linear[0] * inv_alpha;
+       srgb[1] = linear[1] * inv_alpha;
+       srgb[2] = linear[2] * inv_alpha;
        srgb[3] = linear[3];
+       linearrgb_to_srgb_v3_v3(srgb, srgb);
+       srgb[0] *= alpha;
+       srgb[1] *= alpha;
+       srgb[2] *= alpha;
 }
 
 /* LUT accelerated conversions */
index 35e5ec98e57969e9e94cf6f2873c4511e8f56440..3180e7e41540bfac24a58376cf2bb8cbebdb02fb 100644 (file)
@@ -542,11 +542,6 @@ list(APPEND INC
        ${CMAKE_CURRENT_BINARY_DIR}/operations
 )
 
-if(MSVC AND NOT CMAKE_CL_64)
-       set(CMAKE_C_FLAGS "${CMAKE_CXX_FLAGS} /arch:SSE2")
-       set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:SSE2")
-endif()
-
 data_to_c(${CMAKE_CURRENT_SOURCE_DIR}/operations/COM_OpenCLKernels.cl
           ${CMAKE_CURRENT_BINARY_DIR}/operations/COM_OpenCLKernels.cl.h SRC)