BLI_task: nano-optimizations to BLI_task_parallel_range feature.
author Bastien Montagne <montagne29@wanadoo.fr>
Tue, 10 May 2016 15:49:27 +0000 (17:49 +0200)
committer Bastien Montagne <montagne29@wanadoo.fr>
Tue, 10 May 2016 15:57:53 +0000 (17:57 +0200)
This commit makes use of the new task pool feature (instead of allocating its own
tasks), and removes the spinlock used to generate chunks (using atomic operations
instead).

In the best cases (a dynamically scheduled loop with a light processing callback),
we get a speedup of a few percent; in most cases there is no noticeable improvement.

source/blender/blenlib/intern/task.c

index b47931cdde9043ed1e8d43390c4d1afdf311c4ea..bebf331e0c10329a84385a21f41e6cb862d75b2c 100644 (file)
@@ -777,23 +777,29 @@ typedef struct ParallelRangeState {
 
        int iter;
        int chunk_size;
-       SpinLock lock;
 } ParallelRangeState;
 
 BLI_INLINE bool parallel_range_next_iter_get(
         ParallelRangeState * __restrict state,
         int * __restrict iter, int * __restrict count)
 {
-       bool result = false;
-       BLI_spin_lock(&state->lock);
-       if (state->iter < state->stop) {
-               *count = min_ii(state->chunk_size, state->stop - state->iter);
-               *iter = state->iter;
-               state->iter += *count;
-               result = true;
+       uint32_t n, olditer, previter, newiter;
+
+       if (state->iter >= state->stop) {
+               return false;
        }
-       BLI_spin_unlock(&state->lock);
-       return result;
+
+       do {
+               olditer = state->iter;
+               n = min_ii(state->chunk_size, state->stop - state->iter);
+               newiter = olditer + n;
+               previter = atomic_cas_uint32((uint32_t *)&state->iter, olditer, newiter);
+       } while (UNLIKELY(previter != olditer));
+
+       *iter = previter;
+       *count = n;
+
+       return (n != 0);
 }
 
 static void parallel_range_func(
@@ -898,7 +904,6 @@ static void task_parallel_range_ex(
         */
        num_tasks = num_threads * 2;
 
-       BLI_spin_init(&state.lock);
        state.start = start;
        state.stop = stop;
        state.userdata = userdata;
@@ -917,16 +922,15 @@ static void task_parallel_range_ex(
        num_tasks = min_ii(num_tasks, (stop - start) / state.chunk_size);
 
        for (i = 0; i < num_tasks; i++) {
-               BLI_task_pool_push(task_pool,
-                                  parallel_range_func,
-                                  NULL, false,
-                                  TASK_PRIORITY_HIGH);
+               /* Use this pool's pre-allocated tasks. */
+               BLI_task_pool_push_from_thread(task_pool,
+                                              parallel_range_func,
+                                              NULL, false,
+                                              TASK_PRIORITY_HIGH, 0);
        }
 
        BLI_task_pool_work_and_wait(task_pool);
        BLI_task_pool_free(task_pool);
-
-       BLI_spin_end(&state.lock);
 }
 
 /**