Optimization of parallel range
authorSergey Sharybin <sergey.vfx@gmail.com>
Mon, 3 Nov 2014 17:24:08 +0000 (18:24 +0100)
committerSergey Sharybin <sergey.vfx@gmail.com>
Mon, 3 Nov 2014 17:44:29 +0000 (22:44 +0500)
It now supports different scheduling schemas: dynamic and static.
Static one is the default and it splits work into equal number of
range iterations.

Dynamic one allocates chunks of 32 iterations which are then
dynamically sent to a thread which is currently idling.

This gives slightly better performance. Some further tricks are
still possible. For example, we could use smarter static scheduling
where one thread steals tasks from other threads when it runs
out of work to be done.

Also removed an unneeded spin lock in the mesh deform evaluation;
at first glance it seemed there was a reduction involved here, but
in fact threads are just adding a value to the original vertex
coordinates. No write access to the same element of vertexCos
happens from separate threads.

source/blender/blenlib/BLI_task.h
source/blender/blenlib/intern/task.c
source/blender/modifiers/intern/MOD_meshdeform.c

index 8c22a25fe147bd3d25ca17c27bd266ca2632dd28..28da673ea97176dad0433916fca81db8a8e9df78 100644 (file)
@@ -106,7 +106,8 @@ void BLI_task_parallel_range_ex(
         int start, int stop,
         void *userdata,
         TaskParallelRangeFunc func,
-        const int range_threshold);
+        const int range_threshold,
+        const bool use_dynamic_scheduling);
 void BLI_task_parallel_range(
         int start, int stop,
         void *userdata,
index 07c67f001f9b2e86cb216ae71da4e456da35b59e..219ccb18d986d29d34df8a71228c6038fae40462 100644 (file)
@@ -29,6 +29,7 @@
 #include "MEM_guardedalloc.h"
 
 #include "BLI_listbase.h"
+#include "BLI_math.h"
 #include "BLI_task.h"
 #include "BLI_threads.h"
 
@@ -452,18 +453,21 @@ typedef struct ParallelRangeState {
        TaskParallelRangeFunc func;
 
        int iter;
+       int chunk_size;
        SpinLock lock;
 } ParallelRangeState;
 
 BLI_INLINE bool parallel_range_next_iter_get(
-        ParallelRangeState *state,
-        int *iter)
+        ParallelRangeState * __restrict state,
+        int * __restrict iter, int * __restrict count)
 {
        bool result = false;
        if (state->iter < state->stop) {
                BLI_spin_lock(&state->lock);
                if (state->iter < state->stop) {
-                       *iter = state->iter++;
+                       *count = min_ii(state->chunk_size, state->stop - state->iter);
+                       *iter = state->iter;
+                       state->iter += *count;
                        result = true;
                }
                BLI_spin_unlock(&state->lock);
@@ -472,14 +476,17 @@ BLI_INLINE bool parallel_range_next_iter_get(
 }
 
 static void parallel_range_func(
-        TaskPool *pool,
+        TaskPool * __restrict pool,
         void *UNUSED(taskdata),
         int UNUSED(threadid))
 {
-       ParallelRangeState *state = BLI_task_pool_userdata(pool);
-       int iter;
-       while (parallel_range_next_iter_get(state, &iter)) {
-               state->func(state->userdata, iter);
+       ParallelRangeState * __restrict state = BLI_task_pool_userdata(pool);
+       int iter, count;
+       while (parallel_range_next_iter_get(state, &iter, &count)) {
+               int i;
+               for (i = 0; i < count; ++i) {
+                       state->func(state->userdata, iter + i);
+               }
        }
 }
 
@@ -487,12 +494,13 @@ void BLI_task_parallel_range_ex(
         int start, int stop,
         void *userdata,
         TaskParallelRangeFunc func,
-        const int range_threshold)
+        const int range_threshold,
+        const bool use_dynamic_scheduling)
 {
        TaskScheduler *task_scheduler;
        TaskPool *task_pool;
        ParallelRangeState state;
-       int i;
+       int i, num_threads, num_tasks;
 
        BLI_assert(start < stop);
 
@@ -506,21 +514,30 @@ void BLI_task_parallel_range_ex(
                return;
        }
 
-       BLI_spin_init(&state.lock);
-       state.start = start;
-       state.stop = stop;
-       state.userdata = userdata;
-       state.func = func;
-       state.iter = start;
-
        task_scheduler = BLI_task_scheduler_get();
        task_pool = BLI_task_pool_create(task_scheduler, &state);
+       num_threads = BLI_task_scheduler_num_threads(task_scheduler);
 
        /* The idea here is to prevent creating task for each of the loop iterations
         * and instead have tasks which are evenly distributed across CPU cores and
         * pull next iter to be crunched using the queue.
         */
-       for (i = 0; i < 2 * BLI_task_scheduler_num_threads(task_scheduler); i++) {
+       num_tasks = num_threads * 2;
+
+       BLI_spin_init(&state.lock);
+       state.start = start;
+       state.stop = stop;
+       state.userdata = userdata;
+       state.func = func;
+       state.iter = start;
+       if (use_dynamic_scheduling) {
+               state.chunk_size = 32;
+       }
+       else {
+               state.chunk_size = (stop - start) / (num_tasks);
+       }
+
+       for (i = 0; i < num_tasks; i++) {
                BLI_task_pool_push(task_pool,
                                   parallel_range_func,
                                   NULL, false,
@@ -538,5 +555,5 @@ void BLI_task_parallel_range(
         void *userdata,
         TaskParallelRangeFunc func)
 {
-       BLI_task_parallel_range_ex(start, stop, userdata, func, 64);
+       BLI_task_parallel_range_ex(start, stop, userdata, func, 64, false);
 }
index 472f35f3d181c20480bf4956c52dc2b262c3296a..c253c0314ad3b5a5eb4d50a812df1f3c77b1694b 100644 (file)
@@ -213,10 +213,9 @@ typedef struct MeshdeformUserdata {
        float (*vertexCos)[3];
        float (*cagemat)[4];
        float (*icagemat)[3];
-       SpinLock lock;
 } MeshdeformUserdata;
 
-static void meshdeform_vert_task(void *userdata, int iter)
+static void meshdeform_vert_task(void * userdata, int iter)
 {
        MeshdeformUserdata *data = userdata;
        /*const*/ MeshDeformModifierData *mmd = data->mmd;
@@ -265,12 +264,10 @@ static void meshdeform_vert_task(void *userdata, int iter)
        if (totweight > 0.0f) {
                mul_v3_fl(co, fac / totweight);
                mul_m3_v3(data->icagemat, co);
-               BLI_spin_lock(&data->lock);
                if (G.debug_value != 527)
                        add_v3_v3(vertexCos[iter], co);
                else
                        copy_v3_v3(vertexCos[iter], co);
-               BLI_spin_unlock(&data->lock);
        }
 }
 
@@ -395,14 +392,10 @@ static void meshdeformModifier_do(
        data.vertexCos = vertexCos;
        data.cagemat = cagemat;
        data.icagemat = icagemat;
-       BLI_spin_init(&data.lock);
 
        /* Do deformation. */
        BLI_task_parallel_range(0, totvert, &data, meshdeform_vert_task);
 
-       /* Uninitialize user dtaa used by the task system. */
-       BLI_spin_end(&data.lock);
-
        /* release cage derivedmesh */
        MEM_freeN(dco);
        MEM_freeN(cagecos);