Task scheduler: Optimize parallel loop over lists
author     Sergey Sharybin <sergey.vfx@gmail.com>
           Tue, 20 Nov 2018 11:17:03 +0000 (12:17 +0100)
committer  Sergey Sharybin <sergey.vfx@gmail.com>
           Tue, 20 Nov 2018 13:58:17 +0000 (14:58 +0100)
The goal is to address a performance regression when going from
a few threads to tens of threads. On systems with more than 32
CPU threads the threaded loop was actually harmful rather than
beneficial.

The following tweaks are now in place:

- The chunk size is adaptive for the number of threads, which
  minimizes scheduling overhead.

- The number of tasks is adaptive to the list size and chunk
  size.

Here is a performance comparison on a production shot:

 Number of threads        DEG time before        DEG time after
       44                     0.09                   0.02
       32                     0.055                  0.025
       16                     0.025                  0.025
       8                      0.035                  0.033

source/blender/blenlib/intern/task.c

index 2bb5d5397a93b3a57d62eaab20d1aab6918a796d..b6d704d8d822cc3426e250c0e66c7be05ecdbe60 100644 (file)
@@ -1221,6 +1221,31 @@ static void parallel_listbase_func(
        }
 }
 
+static void task_parallel_listbase_no_threads(
+        struct ListBase *listbase,
+        void *userdata,
+        TaskParallelListbaseFunc func)
+{
+       int i = 0;
+       for (Link *link = listbase->first; link != NULL; link = link->next, ++i) {
+               func(userdata, link, i);
+       }
+}
+
+/* NOTE: The idea here is to compensate for rather measurable threading
+ * overhead caused by fetching tasks. With too many CPU threads we are starting
+ * to spend too much time in those overheads. */
+BLI_INLINE int task_parallel_listbasecalc_chunk_size(const int num_threads)
+{
+       if (num_threads > 32) {
+               return 128;
+       }
+       else if (num_threads > 16) {
+               return 64;
+       }
+       return 32;
+}
+
 /**
  * This function allows to parallelize for loops over ListBase items.
  *
@@ -1238,41 +1263,37 @@ void BLI_task_parallel_listbase(
         TaskParallelListbaseFunc func,
         const bool use_threading)
 {
-       TaskScheduler *task_scheduler;
-       TaskPool *task_pool;
-       ParallelListState state;
-       int i, num_threads, num_tasks;
-
        if (BLI_listbase_is_empty(listbase)) {
                return;
        }
-
        if (!use_threading) {
-               i = 0;
-               for (Link *link = listbase->first; link != NULL; link = link->next, ++i) {
-                       func(userdata, link, i);
-               }
+               task_parallel_listbase_no_threads(listbase, userdata, func);
+               return;
+       }
+       TaskScheduler *task_scheduler = BLI_task_scheduler_get();
+       const int num_threads = BLI_task_scheduler_num_threads(task_scheduler);
+       /* TODO(sergey): Consider making chunk size configurable. */
+       const int chunk_size = task_parallel_listbasecalc_chunk_size(num_threads);
+       const int num_tasks = min_ii(
+               num_threads,
+               BLI_listbase_count(listbase) / chunk_size);
+       if (num_tasks <= 1) {
+               task_parallel_listbase_no_threads(listbase, userdata, func);
                return;
        }
 
-       task_scheduler = BLI_task_scheduler_get();
-       task_pool = BLI_task_pool_create_suspended(task_scheduler, &state);
-       num_threads = BLI_task_scheduler_num_threads(task_scheduler);
-
-       /* The idea here is to prevent creating task for each of the loop iterations
-        * and instead have tasks which are evenly distributed across CPU cores and
-        * pull next iter to be crunched using the queue.
-        */
-       num_tasks = num_threads + 2;
+       ParallelListState state;
+       TaskPool *task_pool = BLI_task_pool_create_suspended(task_scheduler, &state);
 
        state.index = 0;
        state.link = listbase->first;
        state.userdata = userdata;
        state.func = func;
-       state.chunk_size = 32;
+       state.chunk_size = chunk_size;
        BLI_spin_init(&state.lock);
 
-       for (i = 0; i < num_tasks; i++) {
+       BLI_assert(num_tasks > 0);
+       for (int i = 0; i < num_tasks; i++) {
                /* Use this pool's pre-allocated tasks. */
                BLI_task_pool_push_from_thread(task_pool,
                                               parallel_listbase_func,