Take advantage of SSE2 instructions in gaussian blur node
author Sergey Sharybin <sergey.vfx@gmail.com>
Fri, 13 Jun 2014 18:30:13 +0000 (00:30 +0600)
committer Sergey Sharybin <sergey.vfx@gmail.com>
Fri, 13 Jun 2014 18:38:07 +0000 (00:38 +0600)
This gives around a 30% speedup for the gaussian blur node.

Pretty much straightforward implementation inside the node
itself, but needed to implement some additional things:

- Aligned malloc. It's needed to load data onto SSE registers
  faster. Based on the aligned_malloc() from Libmv, with
  some additional trickery going on to support arbitrary
  alignment (this magic is needed because of MemHead).

  In practice only 16-byte alignment is supported because
  of the lack of an aligned malloc with arbitrary alignment
  on OSX. Not a big deal for now because we only need 16-byte
  alignment at the moment. Could be tweaked further
  later.

- Memory buffers in the compositor are now aligned to 16 bytes.
  Should be harmless for non-SSE cases too; just mentioning it.

Reviewers: campbellbarton, lukastoenne, jbakker

Reviewed By: campbellbarton

CC: lockal
Differential Revision: https://developer.blender.org/D564

12 files changed:
intern/guardedalloc/MEM_guardedalloc.h
intern/guardedalloc/intern/mallocn.c
intern/guardedalloc/intern/mallocn_guarded_impl.c
intern/guardedalloc/intern/mallocn_intern.h
intern/guardedalloc/intern/mallocn_lockfree_impl.c
source/blender/compositor/intern/COM_MemoryBuffer.cpp
source/blender/compositor/operations/COM_BlurBaseOperation.cpp
source/blender/compositor/operations/COM_BlurBaseOperation.h
source/blender/compositor/operations/COM_GaussianXBlurOperation.cpp
source/blender/compositor/operations/COM_GaussianXBlurOperation.h
source/blender/compositor/operations/COM_GaussianYBlurOperation.cpp
source/blender/compositor/operations/COM_GaussianYBlurOperation.h

index 4fb6896533842c3f8d7237a18194875d8ef5edb5..8c5ad77b8b6bfe0f7254a6cae77ce7bdbf47bb17 100644 (file)
@@ -119,6 +119,12 @@ extern "C" {
         * */
        extern void *(*MEM_mallocN)(size_t len, const char *str) /* ATTR_MALLOC */ ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(2);
 
+       /**
+        * Allocate a block of len bytes aligned to alignment (a power of two < 1024), with
+        * tag name str. The name must be static, because only a pointer to it is stored!
+        * */
+       extern void *(*MEM_mallocN_aligned)(size_t len, size_t alignment, const char *str) /* ATTR_MALLOC */ ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(3);
+
        /**
         * Same as callocN, clears memory and uses mmap (disk cached) if supported.
         * Can be free'd with MEM_freeN as usual.
index e85fba7a6d07dad7ecfa593cac638f4d1100dd14..b0d252cca1453f16f51fba5014356506b12a8bda 100644 (file)
@@ -41,6 +41,7 @@ void *(*MEM_reallocN_id)(void *vmemh, size_t len, const char *str) = MEM_lockfre
 void *(*MEM_recallocN_id)(void *vmemh, size_t len, const char *str) = MEM_lockfree_recallocN_id;
 void *(*MEM_callocN)(size_t len, const char *str) = MEM_lockfree_callocN;
 void *(*MEM_mallocN)(size_t len, const char *str) = MEM_lockfree_mallocN;
+void *(*MEM_mallocN_aligned)(size_t len, size_t alignment, const char *str) = MEM_lockfree_mallocN_aligned;
 void *(*MEM_mapallocN)(size_t len, const char *str) = MEM_lockfree_mapallocN;
 void (*MEM_printmemlist_pydict)(void) = MEM_lockfree_printmemlist_pydict;
 void (*MEM_printmemlist)(void) = MEM_lockfree_printmemlist;
@@ -60,6 +61,40 @@ uintptr_t (*MEM_get_peak_memory)(void) = MEM_lockfree_get_peak_memory;
 const char *(*MEM_name_ptr)(void *vmemh) = MEM_lockfree_name_ptr;
 #endif
 
+void *aligned_malloc(size_t size, size_t alignment)  /* release the result with aligned_free() */
+{
+#ifdef _WIN32
+       return _aligned_malloc(size, alignment);
+#elif defined(__APPLE__)
+       /* On Mac OS X, both the heap and the stack are guaranteed 16-byte aligned, so
+        * plain malloc() already works with SSE types; no other alignment is supported here.
+        */
+       assert(alignment == 16);
+       return malloc(size);
+#elif defined(__FreeBSD__) || defined(__NetBSD__)
+       void *result;
+
+       if (posix_memalign(&result, alignment, size)) {
+               /* A non-zero return means failure: either the allocation
+                * failed or the alignment value was rejected.
+                */
+               return NULL;
+       }
+       return result;
+#else  /* This is for Linux. */
+       return memalign(alignment, size);
+#endif
+}
+
+void aligned_free(void *ptr)  /* counterpart of aligned_malloc() */
+{
+#ifdef _WIN32
+       _aligned_free(ptr);  /* memory from _aligned_malloc() needs the matching free */
+#else
+       free(ptr);
+#endif
+}
+
 void MEM_use_guarded_allocator(void)
 {
        MEM_allocN_len = MEM_guarded_allocN_len;
@@ -69,6 +104,7 @@ void MEM_use_guarded_allocator(void)
        MEM_recallocN_id = MEM_guarded_recallocN_id;
        MEM_callocN = MEM_guarded_callocN;
        MEM_mallocN = MEM_guarded_mallocN;
+       MEM_mallocN_aligned = MEM_guarded_mallocN_aligned;
        MEM_mapallocN = MEM_guarded_mapallocN;
        MEM_printmemlist_pydict = MEM_guarded_printmemlist_pydict;
        MEM_printmemlist = MEM_guarded_printmemlist;
index 172c79d50cd33ec76d09f1ad22fcabb70abfd992..206390e071093e0f8bb450d91a55cf87cfa78e0f 100644 (file)
@@ -113,7 +113,10 @@ typedef struct MemHead {
        const char *name;
        const char *nextname;
        int tag2;
-       int mmap;  /* if true, memory was mmapped */
+       short mmap;  /* if true, memory was mmapped */
+       short alignment;  /* if non-zero aligned alloc was used
+                          * and alignment is stored here.
+                          */
 #ifdef DEBUG_MEMCOUNTER
        int _count;
 #endif
@@ -128,6 +131,8 @@ typedef struct MemHead {
 #endif
 } MemHead;
 
+typedef MemHead MemHeadAligned;  /* same header for aligned blocks; MEMHEAD_ALIGN_PADDING() sizes its padding from this type */
+
 /* for openmp threading asserts, saves time troubleshooting
  * we may need to extend this if blender code starts using MEM_
  * functions inside OpenMP correctly with omp_set_lock() */
@@ -187,7 +192,7 @@ static const char *check_memlist(MemHead *memh);
 
 #define MEMNEXT(x) \
        ((MemHead *)(((char *) x) - ((char *) &(((MemHead *)0)->next))))
-       
+
 /* --------------------------------------------------------------------- */
 /* vars                                                                  */
 /* --------------------------------------------------------------------- */
@@ -325,10 +330,12 @@ void *MEM_guarded_dupallocN(const void *vmemh)
                memh--;
 
 #ifndef DEBUG_MEMDUPLINAME
-               if (memh->mmap)
+               if (UNLIKELY(memh->mmap))
+                       newp = MEM_guarded_mapallocN(memh->len, "dupli_mapalloc");
+               else if (LIKELY(memh->alignment == 0))
                        newp = MEM_guarded_mapallocN(memh->len, "dupli_mapalloc");
                else
-                       newp = MEM_guarded_mallocN(memh->len, "dupli_alloc");
+                       newp = MEM_guarded_mallocN_aligned(memh->len, (size_t) memh->alignment, "dupli_alloc");
 
                if (newp == NULL) return NULL;
 #else
@@ -336,14 +343,18 @@ void *MEM_guarded_dupallocN(const void *vmemh)
                        MemHead *nmemh;
                        char *name = malloc(strlen(memh->name) + 24);
 
-                       if (memh->mmap) {
+                       if (UNLIKELY(memh->mmap)) {
                                sprintf(name, "%s %s", "dupli_mapalloc", memh->name);
                                newp = MEM_guarded_mapallocN(memh->len, name);
                        }
-                       else {
+                       else if (LIKELY(memh->alignment == 0)) {
                                sprintf(name, "%s %s", "dupli_alloc", memh->name);
                                newp = MEM_guarded_mallocN(memh->len, name);
                        }
+                       else {
+                               sprintf(name, "%s %s", "dupli_alloc", memh->name);
+                               newp = MEM_guarded_mallocN_aligned(memh->len, (size_t) memh->alignment, name);
+                       }
 
                        if (newp == NULL) return NULL;
 
@@ -368,7 +379,13 @@ void *MEM_guarded_reallocN_id(void *vmemh, size_t len, const char *str)
                MemHead *memh = vmemh;
                memh--;
 
-               newp = MEM_guarded_mallocN(len, memh->name);
+               if (LIKELY(memh->alignment == 0)) {
+                       newp = MEM_guarded_mallocN(len, memh->name);
+               }
+               else {
+                       newp = MEM_guarded_mallocN_aligned(len, (size_t) memh->alignment, memh->name);
+               }
+
                if (newp) {
                        if (len < memh->len) {
                                /* shrink */
@@ -397,7 +414,13 @@ void *MEM_guarded_recallocN_id(void *vmemh, size_t len, const char *str)
                MemHead *memh = vmemh;
                memh--;
 
-               newp = MEM_guarded_mallocN(len, memh->name);
+               if (LIKELY(memh->alignment == 0)) {
+                       newp = MEM_guarded_mallocN(len, memh->name);
+               }
+               else {
+                       newp = MEM_guarded_mallocN_aligned(len, (size_t) memh->alignment, memh->name);
+               }
+
                if (newp) {
                        if (len < memh->len) {
                                /* shrink */
@@ -464,6 +487,7 @@ static void make_memhead_header(MemHead *memh, size_t len, const char *str)
        memh->nextname = NULL;
        memh->len = len;
        memh->mmap = 0;
+       memh->alignment = 0;
        memh->tag2 = MEMTAG2;
 
 #ifdef DEBUG_MEMDUPLINAME
@@ -514,6 +538,54 @@ void *MEM_guarded_mallocN(size_t len, const char *str)
        return NULL;
 }
 
+void *MEM_guarded_mallocN_aligned(size_t len, size_t alignment, const char *str)
+{
+       MemHead *memh;
+
+       /* It's possible that MemHead's size is not properly aligned,
+        * do extra padding to deal with this.
+        *
+        * We only support small alignments which fit into a short in
+        * order to save some bits in the MemHead structure.
+        */
+       short extra_padding = (short)MEMHEAD_ALIGN_PADDING(alignment);
+
+       /* Huge alignment values don't make sense and they
+        * wouldn't fit into the 'short' used in the MemHead.
+        */
+       assert(alignment < 1024);
+
+       /* We only support alignment to a power of two. */
+       assert(IS_POW2(alignment));
+
+       len = SIZET_ALIGN_4(len);
+
+       memh = (MemHead *)aligned_malloc(len + (size_t)extra_padding + sizeof(MemHead) + sizeof(MemTail), alignment);
+
+       if (LIKELY(memh)) {
+               /* We keep the padding in front of the MemHead, so the
+                * MemHead always sits directly before the data pointer
+                * and can be recovered from it.
+                */
+               memh = (MemHead *)((char *)memh + extra_padding);
+
+               make_memhead_header(memh, len, str);
+               memh->alignment = (short) alignment;  /* lets rem_memblock() find the real pointer to free */
+               if (UNLIKELY(malloc_debug_memset && len))
+                       memset(memh + 1, 255, len);
+
+#ifdef DEBUG_MEMCOUNTER
+               if (_mallocn_count == DEBUG_MEMCOUNTER_ERROR_VAL)
+                       memcount_raise(__func__);
+               memh->_count = _mallocn_count++;
+#endif
+               return (++memh);
+       }
+       print_error("aligned_malloc returns null: len=" SIZET_FORMAT " in %s, total %u\n",
+                   SIZET_ARG(len), str, (unsigned int) mem_in_use);
+       return NULL;
+}
+
 void *MEM_guarded_callocN(size_t len, const char *str)
 {
        MemHead *memh;
@@ -953,7 +1025,12 @@ static void rem_memblock(MemHead *memh)
        else {
                if (UNLIKELY(malloc_debug_memset && memh->len))
                        memset(memh + 1, 255, memh->len);
-               free(memh);
+               if (LIKELY(memh->alignment == 0)) {
+                       free(memh);
+               }
+               else {
+                       aligned_free(MEMHEAD_REAL_PTR(memh));
+               }
        }
 }
 
index 7c8922dd407330680bafb86d3389e0a760f2a23b..a69bcf3d27b2ad123355f8a86a56cf66b4b7cd66 100644 (file)
 #  define UNLIKELY(x)     (x)
 #endif
 
+#if !defined(__APPLE__) && !defined(__FreeBSD__) && !defined(__NetBSD__)
+// Needed for memalign on Linux and _aligned_malloc on Windows.
+#  ifdef FREE_WINDOWS
+/* make sure _aligned_malloc is included */
+#    ifdef __MSVCRT_VERSION__
+#      undef __MSVCRT_VERSION__
+#    endif
+
+#    define __MSVCRT_VERSION__ 0x0700
+#  endif  // FREE_WINDOWS
+
+#  include <malloc.h>
+#else
+// Apple's malloc is 16-byte aligned, and does not have malloc.h, so include
+// stdlib.h instead (the C header: this file is included from C sources).
+#  include <stdlib.h>
+#endif
+
+#define IS_POW2(a) (((a) & ((a) - 1)) == 0)  /* NOTE: also true for 0. */
+
+/* Padding (1..alignment bytes) put before the header so header + padding is a multiple of alignment. */
+#define MEMHEAD_ALIGN_PADDING(alignment) ((size_t)(alignment) - (sizeof(MemHeadAligned) % (size_t)(alignment)))
+
+/* Real pointer returned by aligned_malloc(), i.e. the header address minus its padding. */
+#define MEMHEAD_REAL_PTR(memh) ((char *)(memh) - MEMHEAD_ALIGN_PADDING((memh)->alignment))
+
+void *aligned_malloc(size_t size, size_t alignment);
+void aligned_free(void *ptr);
+
 /* Prototypes for counted allocator functions */
 size_t MEM_lockfree_allocN_len(const void *vmemh) ATTR_WARN_UNUSED_RESULT;
 void MEM_lockfree_freeN(void *vmemh);
@@ -93,6 +122,7 @@ void *MEM_lockfree_reallocN_id(void *vmemh, size_t len, const char *UNUSED(str))
 void *MEM_lockfree_recallocN_id(void *vmemh, size_t len, const char *UNUSED(str))  ATTR_MALLOC ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(2);
 void *MEM_lockfree_callocN(size_t len, const char *UNUSED(str))  ATTR_MALLOC ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(2);
 void *MEM_lockfree_mallocN(size_t len, const char *UNUSED(str)) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(2);
+void *MEM_lockfree_mallocN_aligned(size_t len, size_t alignment, const char *UNUSED(str)) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(3);
 void *MEM_lockfree_mapallocN(size_t len, const char *UNUSED(str)) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(2);
 void MEM_lockfree_printmemlist_pydict(void);
 void MEM_lockfree_printmemlist(void);
@@ -119,6 +149,7 @@ void *MEM_guarded_reallocN_id(void *vmemh, size_t len, const char *UNUSED(str))
 void *MEM_guarded_recallocN_id(void *vmemh, size_t len, const char *UNUSED(str)) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(2);
 void *MEM_guarded_callocN(size_t len, const char *UNUSED(str)) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(2);
 void *MEM_guarded_mallocN(size_t len, const char *UNUSED(str)) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(2);
+void *MEM_guarded_mallocN_aligned(size_t len, size_t alignment, const char *UNUSED(str)) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(3);
 void *MEM_guarded_mapallocN(size_t len, const char *UNUSED(str)) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(2);
 void MEM_guarded_printmemlist_pydict(void);
 void MEM_guarded_printmemlist(void);
index 6fc01807af34bf62e44d6832585d6b606cf1ec46..c76caff0d74e56024128d792606bca3210fb3bae 100644 (file)
@@ -46,6 +46,11 @@ typedef struct MemHead {
        size_t len;
 } MemHead;
 
+typedef struct MemHeadAligned {
+       short alignment;  /* alignment the block was allocated with */
+       size_t len;       /* block length; low bits carry the MEMHEAD_*_FLAG bits */
+} MemHeadAligned;
+
 static unsigned int totblock = 0;
 static size_t mem_in_use = 0, mmap_in_use = 0, peak_mem = 0;
 static bool malloc_debug_memset = false;
@@ -54,9 +59,17 @@ static void (*error_callback)(const char *) = NULL;
 static void (*thread_lock_callback)(void) = NULL;
 static void (*thread_unlock_callback)(void) = NULL;
 
+enum {
+       MEMHEAD_MMAP_FLAG = 1,   /* OR-ed into len for mmap'ed blocks */
+       MEMHEAD_ALIGN_FLAG = 2,  /* OR-ed into len for aligned blocks */
+};
+
 #define MEMHEAD_FROM_PTR(ptr) (((MemHead*) vmemh) - 1)
 #define PTR_FROM_MEMHEAD(memhead) (memhead + 1)
-#define MEMHEAD_IS_MMAP(memhead) ((memhead)->len & (size_t) 1)
+#define MEMHEAD_ALIGNED_FROM_PTR(ptr) (((MemHeadAligned*) (ptr)) - 1)  /* uses its argument, not a caller-local name */
+#define PTR_FROM_MEMHEAD_ALIGNED(memhead) ((memhead) + 1)  /* data pointer that follows the header */
+#define MEMHEAD_IS_MMAP(memhead) ((memhead)->len & (size_t) MEMHEAD_MMAP_FLAG)  /* non-zero for mmap'ed blocks */
+#define MEMHEAD_IS_ALIGNED(memhead) ((memhead)->len & (size_t) MEMHEAD_ALIGN_FLAG)  /* non-zero for aligned blocks */
 
 #ifdef __GNUC__
 __attribute__ ((format(printf, 1, 2)))
@@ -93,7 +106,7 @@ static void mem_unlock_thread(void)
 size_t MEM_lockfree_allocN_len(const void *vmemh)
 {
        if (vmemh) {
-               return MEMHEAD_FROM_PTR(vmemh)->len & ~((size_t) 1);
+               return MEMHEAD_FROM_PTR(vmemh)->len & ~((size_t) (MEMHEAD_MMAP_FLAG | MEMHEAD_ALIGN_FLAG));
        }
        else {
                return 0;
@@ -124,7 +137,13 @@ void MEM_lockfree_freeN(void *vmemh)
                if (UNLIKELY(malloc_debug_memset && len)) {
                        memset(memh + 1, 255, len);
                }
-               free(memh);
+               if (UNLIKELY(MEMHEAD_IS_ALIGNED(memh))) {
+                       MemHeadAligned *memh_aligned = MEMHEAD_ALIGNED_FROM_PTR(vmemh);
+                       aligned_free(MEMHEAD_REAL_PTR(memh_aligned));
+               }
+               else {
+                       free(memh);
+               }
        }
 }
 
@@ -134,9 +153,16 @@ void *MEM_lockfree_dupallocN(const void *vmemh)
        if (vmemh) {
                MemHead *memh = MEMHEAD_FROM_PTR(vmemh);
                const size_t prev_size = MEM_allocN_len(vmemh);
-               if (MEMHEAD_IS_MMAP(memh)) {
+               if (UNLIKELY(MEMHEAD_IS_MMAP(memh))) {
                        newp = MEM_lockfree_mapallocN(prev_size, "dupli_mapalloc");
                }
+               else if (UNLIKELY(MEMHEAD_IS_ALIGNED(memh))) {
+                       MemHeadAligned *memh_aligned = MEMHEAD_ALIGNED_FROM_PTR(vmemh);
+                       newp = MEM_lockfree_mallocN_aligned(
+                               prev_size,
+                               (size_t)memh_aligned->alignment,
+                               "dupli_malloc");
+               }
                else {
                        newp = MEM_lockfree_mallocN(prev_size, "dupli_malloc");
                }
@@ -150,9 +176,20 @@ void *MEM_lockfree_reallocN_id(void *vmemh, size_t len, const char *str)
        void *newp = NULL;
 
        if (vmemh) {
+               MemHead *memh = MEMHEAD_FROM_PTR(vmemh);
                size_t old_len = MEM_allocN_len(vmemh);
 
-               newp = MEM_lockfree_mallocN(len, "realloc");
+               if (LIKELY(!MEMHEAD_IS_ALIGNED(memh))) {
+                       newp = MEM_lockfree_mallocN(len, "realloc");
+               }
+               else {
+                       MemHeadAligned *memh_aligned = MEMHEAD_ALIGNED_FROM_PTR(vmemh);
+                       newp = MEM_lockfree_mallocN_aligned(
+                               len,  /* the requested new size, not old_len */
+                               (size_t)memh_aligned->alignment,
+                               "realloc");
+               }
+
                if (newp) {
                        if (len < old_len) {
                                /* shrink */
@@ -178,9 +215,19 @@ void *MEM_lockfree_recallocN_id(void *vmemh, size_t len, const char *str)
        void *newp = NULL;
 
        if (vmemh) {
+               MemHead *memh = MEMHEAD_FROM_PTR(vmemh);
                size_t old_len = MEM_allocN_len(vmemh);
 
-               newp = MEM_lockfree_mallocN(len, "recalloc");
+               if (LIKELY(!MEMHEAD_IS_ALIGNED(memh))) {
+                       newp = MEM_lockfree_mallocN(len, "recalloc");
+               }
+               else {
+                       MemHeadAligned *memh_aligned = MEMHEAD_ALIGNED_FROM_PTR(vmemh);
+                       newp = MEM_lockfree_mallocN_aligned(len,  /* the requested new size, not old_len */
+                                                           (size_t)memh_aligned->alignment,
+                                                           "recalloc");
+               }
+
                if (newp) {
                        if (len < old_len) {
                                /* shrink */
@@ -256,6 +303,57 @@ void *MEM_lockfree_mallocN(size_t len, const char *str)
        return NULL;
 }
 
+void *MEM_lockfree_mallocN_aligned(size_t len, size_t alignment, const char *str)
+{
+       MemHeadAligned *memh;
+
+       /* It's possible that MemHead's size is not properly aligned,
+        * do extra padding to deal with this.
+        *
+        * We only support small alignments which fit into a short in
+        * order to save some bits in the MemHead structure.
+        */
+       size_t extra_padding = MEMHEAD_ALIGN_PADDING(alignment);
+
+       /* Huge alignment values don't make sense and they
+        * wouldn't fit into the 'short' used in the MemHead.
+        */
+       assert(alignment < 1024);
+
+       /* We only support alignment to a power of two. */
+       assert(IS_POW2(alignment));
+
+       len = SIZET_ALIGN_4(len);
+
+       memh = (MemHeadAligned *)aligned_malloc(
+               len + extra_padding + sizeof(MemHeadAligned), alignment);
+
+       if (LIKELY(memh)) {
+               /* We keep the padding in front of the MemHead, so the
+                * MemHead always sits directly before the data pointer
+                * and can be recovered from it.
+                */
+               memh = (MemHeadAligned *)((char *)memh + extra_padding);
+
+               if (UNLIKELY(malloc_debug_memset && len)) {
+                       memset(memh + 1, 255, len);
+               }
+
+               memh->len = len | (size_t) MEMHEAD_ALIGN_FLAG;  /* flag tells MEM_lockfree_freeN() to use aligned_free() */
+               memh->alignment = (short) alignment;
+               atomic_add_u(&totblock, 1);
+               atomic_add_z(&mem_in_use, len);
+
+               /* TODO(sergey): Not strictly speaking thread-safe. */
+               peak_mem = mem_in_use > peak_mem ? mem_in_use : peak_mem;
+
+               return PTR_FROM_MEMHEAD(memh);
+       }
+       print_error("Malloc returns null: len=" SIZET_FORMAT " in %s, total %u\n",
+                   SIZET_ARG(len), str, (unsigned int) mem_in_use);
+       return NULL;
+}
+
 void *MEM_lockfree_mapallocN(size_t len, const char *str)
 {
        MemHead *memh;
@@ -279,7 +377,7 @@ void *MEM_lockfree_mapallocN(size_t len, const char *str)
 #endif
 
        if (memh != (MemHead *)-1) {
-               memh->len = len | (size_t) 1;
+               memh->len = len | (size_t) MEMHEAD_MMAP_FLAG;
                atomic_add_u(&totblock, 1);
                atomic_add_z(&mem_in_use, len);
                atomic_add_z(&mmap_in_use, len);
index 04828bfe3f84e592c2b6bd5ab70dfaaae338200f..c1916f4a68f88b615f65bb4a8d1ef31ed4e87c5c 100644 (file)
@@ -46,7 +46,7 @@ MemoryBuffer::MemoryBuffer(MemoryProxy *memoryProxy, unsigned int chunkNumber, r
        BLI_rcti_init(&this->m_rect, rect->xmin, rect->xmax, rect->ymin, rect->ymax);
        this->m_memoryProxy = memoryProxy;
        this->m_chunkNumber = chunkNumber;
-       this->m_buffer = (float *)MEM_mallocN(sizeof(float) * determineBufferSize() * COM_NUMBER_OF_CHANNELS, "COM_MemoryBuffer");
+       this->m_buffer = (float *)MEM_mallocN_aligned(sizeof(float) * determineBufferSize() * COM_NUMBER_OF_CHANNELS, 16, "COM_MemoryBuffer");
        this->m_state = COM_MB_ALLOCATED;
        this->m_datatype = COM_DT_COLOR;
        this->m_chunkWidth = this->m_rect.xmax - this->m_rect.xmin;
@@ -57,7 +57,7 @@ MemoryBuffer::MemoryBuffer(MemoryProxy *memoryProxy, rcti *rect)
        BLI_rcti_init(&this->m_rect, rect->xmin, rect->xmax, rect->ymin, rect->ymax);
        this->m_memoryProxy = memoryProxy;
        this->m_chunkNumber = -1;
-       this->m_buffer = (float *)MEM_mallocN(sizeof(float) * determineBufferSize() * COM_NUMBER_OF_CHANNELS, "COM_MemoryBuffer");
+       this->m_buffer = (float *)MEM_mallocN_aligned(sizeof(float) * determineBufferSize() * COM_NUMBER_OF_CHANNELS, 16, "COM_MemoryBuffer");
        this->m_state = COM_MB_TEMPORARILY;
        this->m_datatype = COM_DT_COLOR;
        this->m_chunkWidth = this->m_rect.xmax - this->m_rect.xmin;
index e7af9319f88c94b3c3501cf7c3a65b100d9508a9..d5aafc7c2ae43be9e81f52f3b74e1227dba98d07 100644 (file)
@@ -91,6 +91,18 @@ float *BlurBaseOperation::make_gausstab(float rad, int size)
        return gausstab;
 }
 
+#ifdef __SSE2__
+__m128 *BlurBaseOperation::convert_gausstab_sse(const float *gausstab, float rad, int size)  /* NOTE: rad is unused */
+{
+       int n = 2 * size + 1;  /* number of table entries converted */
+       __m128 *gausstab_sse = (__m128 *) MEM_mallocN_aligned(sizeof(__m128) * n, 16, "gausstab sse");
+       for (int i = 0; i < n; ++i) {
+               gausstab_sse[i] = _mm_set1_ps(gausstab[i]);  /* same weight in all four lanes */
+       }
+       return gausstab_sse;  /* released with MEM_freeN() in deinitExecution() */
+}
+#endif
+
 /* normalized distance from the current (inverted so 1.0 is close and 0.0 is far)
  * 'ease' is applied after, looks nicer */
 float *BlurBaseOperation::make_dist_fac_inverse(float rad, int size, int falloff)
index 052a525ef2cc61f8e996e837797ca1c698c40325..e97dd4d766db1cbd43f35efdb9b66ce1a94ed13c 100644 (file)
 
 #define MAX_GAUSSTAB_RADIUS 30000
 
+#ifdef __SSE2__
+#  include <emmintrin.h>
+#endif
+
 class BlurBaseOperation : public NodeOperation, public QualityStepHelper {
 private:
 
@@ -34,6 +38,9 @@ protected:
 
        BlurBaseOperation(DataType data_type);
        float *make_gausstab(float rad, int size);
+#ifdef __SSE2__
+       __m128 *convert_gausstab_sse(const float *gaustab, float rad, int size);
+#endif
        float *make_dist_fac_inverse(float rad, int size, int falloff);
 
        void updateSize();
index d08924ca4efa24bbc088ac8ec3b480044d243eb6..0aefba3bb7c7166bef391db536159fce2d35b330 100644 (file)
@@ -31,6 +31,9 @@ extern "C" {
 GaussianXBlurOperation::GaussianXBlurOperation() : BlurBaseOperation(COM_DT_COLOR)
 {
        this->m_gausstab = NULL;
+#ifdef __SSE2__
+       this->m_gausstab_sse = NULL;
+#endif
        this->m_filtersize = 0;
 }
 
@@ -54,8 +57,14 @@ void GaussianXBlurOperation::initExecution()
        if (this->m_sizeavailable) {
                float rad = max_ff(m_size * m_data.sizex, 0.0f);
                m_filtersize = min_ii(ceil(rad), MAX_GAUSSTAB_RADIUS);
-               
+
+               /* TODO(sergey): De-duplicate with the case below and Y blur. */
                this->m_gausstab = BlurBaseOperation::make_gausstab(rad, m_filtersize);
+#ifdef __SSE2__
+               this->m_gausstab_sse = BlurBaseOperation::convert_gausstab_sse(this->m_gausstab,
+                                                                              rad,
+                                                                              m_filtersize);
+#endif
        }
 }
 
@@ -65,8 +74,13 @@ void GaussianXBlurOperation::updateGauss()
                updateSize();
                float rad = max_ff(m_size * m_data.sizex, 0.0f);
                m_filtersize = min_ii(ceil(rad), MAX_GAUSSTAB_RADIUS);
-               
+
                this->m_gausstab = BlurBaseOperation::make_gausstab(rad, m_filtersize);
+#ifdef __SSE2__
+               this->m_gausstab_sse = BlurBaseOperation::convert_gausstab_sse(this->m_gausstab,
+                                                                              rad,
+                                                                              m_filtersize);
+#endif
        }
 }
 
@@ -88,12 +102,25 @@ void GaussianXBlurOperation::executePixel(float output[4], int x, int y, void *d
        int step = getStep();
        int offsetadd = getOffsetAdd();
        int bufferindex = ((xmin - bufferstartx) * 4) + ((ymin - bufferstarty) * 4 * bufferwidth);
+
+#ifdef __SSE2__
+       __m128 accum_r = _mm_load_ps(color_accum);
+       for (int nx = xmin, index = (xmin - x) + this->m_filtersize; nx < xmax; nx += step, index += step) {
+               __m128 reg_a = _mm_load_ps(&buffer[bufferindex]);
+               reg_a = _mm_mul_ps(reg_a, this->m_gausstab_sse[index]);
+               accum_r = _mm_add_ps(accum_r, reg_a);
+               multiplier_accum += this->m_gausstab[index];
+               bufferindex += offsetadd;
+       }
+       _mm_store_ps(color_accum, accum_r);
+#else
        for (int nx = xmin, index = (xmin - x) + this->m_filtersize; nx < xmax; nx += step, index += step) {
                const float multiplier = this->m_gausstab[index];
                madd_v4_v4fl(color_accum, &buffer[bufferindex], multiplier);
                multiplier_accum += multiplier;
                bufferindex += offsetadd;
        }
+#endif
        mul_v4_v4fl(output, color_accum, 1.0f / multiplier_accum);
 }
 
@@ -105,6 +132,12 @@ void GaussianXBlurOperation::deinitExecution()
                MEM_freeN(this->m_gausstab);
                this->m_gausstab = NULL;
        }
+#ifdef __SSE2__
+       if (this->m_gausstab_sse) {
+               MEM_freeN(this->m_gausstab_sse);
+               this->m_gausstab_sse = NULL;
+       }
+#endif
 
        deinitMutex();
 }
index 6442f2141381e60028f0fd9c3d5a60e7f6051b02..e391320a007120c320747e3596778978db3f9175 100644 (file)
@@ -28,6 +28,9 @@
 class GaussianXBlurOperation : public BlurBaseOperation {
 private:
        float *m_gausstab;
+#ifdef __SSE2__
+       __m128 *m_gausstab_sse;
+#endif
        int m_filtersize;
        void updateGauss();
 public:
index 8216b79372fe9a4cc44209473dc6f60f68ff2410..a05a1ab6a23a939bd390ce26f2c956b1422179dc 100644 (file)
@@ -31,6 +31,9 @@ extern "C" {
 GaussianYBlurOperation::GaussianYBlurOperation() : BlurBaseOperation(COM_DT_COLOR)
 {
        this->m_gausstab = NULL;
+#ifdef __SSE2__
+       this->m_gausstab_sse = NULL;
+#endif
        this->m_filtersize = 0;
 }
 
@@ -54,8 +57,13 @@ void GaussianYBlurOperation::initExecution()
        if (this->m_sizeavailable) {
                float rad = max_ff(m_size * m_data.sizey, 0.0f);
                m_filtersize = min_ii(ceil(rad), MAX_GAUSSTAB_RADIUS);
-               
+
                this->m_gausstab = BlurBaseOperation::make_gausstab(rad, m_filtersize);
+#ifdef __SSE2__
+               this->m_gausstab_sse = BlurBaseOperation::convert_gausstab_sse(this->m_gausstab,
+                                                                              rad,
+                                                                              m_filtersize);
+#endif
        }
 }
 
@@ -65,8 +73,13 @@ void GaussianYBlurOperation::updateGauss()
                updateSize();
                float rad = max_ff(m_size * m_data.sizey, 0.0f);
                m_filtersize = min_ii(ceil(rad), MAX_GAUSSTAB_RADIUS);
-               
+
                this->m_gausstab = BlurBaseOperation::make_gausstab(rad, m_filtersize);
+#ifdef __SSE2__
+               this->m_gausstab_sse = BlurBaseOperation::convert_gausstab_sse(this->m_gausstab,
+                                                                              rad,
+                                                                              m_filtersize);
+#endif
        }
 }
 
@@ -88,6 +101,20 @@ void GaussianYBlurOperation::executePixel(float output[4], int x, int y, void *d
        int index;
        int step = getStep();
        const int bufferIndexx = ((xmin - bufferstartx) * 4);
+
+#ifdef __SSE2__
+       __m128 accum_r = _mm_load_ps(color_accum);
+       for (int ny = ymin; ny < ymax; ny += step) {
+               index = (ny - y) + this->m_filtersize;
+               int bufferindex = bufferIndexx + ((ny - bufferstarty) * 4 * bufferwidth);
+               const float multiplier = this->m_gausstab[index];
+               __m128 reg_a = _mm_load_ps(&buffer[bufferindex]);
+               reg_a = _mm_mul_ps(reg_a, this->m_gausstab_sse[index]);
+               accum_r = _mm_add_ps(accum_r, reg_a);
+               multiplier_accum += multiplier;
+       }
+       _mm_store_ps(color_accum, accum_r);
+#else
        for (int ny = ymin; ny < ymax; ny += step) {
                index = (ny - y) + this->m_filtersize;
                int bufferindex = bufferIndexx + ((ny - bufferstarty) * 4 * bufferwidth);
@@ -95,6 +122,7 @@ void GaussianYBlurOperation::executePixel(float output[4], int x, int y, void *d
                madd_v4_v4fl(color_accum, &buffer[bufferindex], multiplier);
                multiplier_accum += multiplier;
        }
+#endif
        mul_v4_v4fl(output, color_accum, 1.0f / multiplier_accum);
 }
 
@@ -106,6 +134,12 @@ void GaussianYBlurOperation::deinitExecution()
                MEM_freeN(this->m_gausstab);
                this->m_gausstab = NULL;
        }
+#ifdef __SSE2__
+       if (this->m_gausstab_sse) {
+               MEM_freeN(this->m_gausstab_sse);
+               this->m_gausstab_sse = NULL;
+       }
+#endif
 
        deinitMutex();
 }
index 16503360de20c20c25da4193e07b7634a180657b..22b6562077d6782b84e8a2a0db32d3ed93b1fbcd 100644 (file)
@@ -28,6 +28,9 @@
 class GaussianYBlurOperation : public BlurBaseOperation {
 private:
        float *m_gausstab;
+#ifdef __SSE2__
+       __m128 *m_gausstab_sse;
+#endif
        int m_filtersize;
        void updateGauss();
 public: