Fix T59874: Cycles CPU 25% load only during rendering
authorSergey Sharybin <sergey.vfx@gmail.com>
Thu, 27 Dec 2018 18:01:19 +0000 (19:01 +0100)
committerSergey Sharybin <sergey.vfx@gmail.com>
Thu, 27 Dec 2018 18:12:59 +0000 (19:12 +0100)
The issue was introduced by a Threadripper2 commit back in
ce927e15e0e3. This boils down to threads inheriting affinity
from the parent thread. It is a question how this slipped
through the review (we definitely run benchmark round).

Quick fix could have been to always set CPU group affinity
in Cycles, and it would work for Windows. On other platforms
we did not have CPU groups API finished.

Ended up making Cycles aware of NUMA topology, so now we
bound threads to a specific NUMA node. This required adding
an external dependency to Cycles, but made some code there
shorter.

intern/cycles/util/CMakeLists.txt
intern/cycles/util/util_system.cpp
intern/cycles/util/util_system.h
intern/cycles/util/util_task.cpp
intern/cycles/util/util_thread.cpp
intern/cycles/util/util_thread.h
intern/cycles/util/util_windows.cpp [deleted file]
intern/cycles/util/util_windows.h
intern/numaapi/include/numaapi.h
intern/numaapi/source/numaapi_linux.c

index 92dfc9fa85d82df278297993808f07f9f2c2c04f..42626d05cf96ad951fb62703ea9130f0ab04e891 100644 (file)
@@ -25,13 +25,17 @@ set(SRC
        util_thread.cpp
        util_time.cpp
        util_transform.cpp
-       util_windows.cpp
 )
 
-if(WITH_CYCLES_STANDALONE AND WITH_CYCLES_STANDALONE_GUI)
-       list(APPEND SRC
-               util_view.cpp
-       )
+if(WITH_CYCLES_STANDALONE)
+       if (WITH_CYCLES_STANDALONE_GUI)
+               list(APPEND SRC
+                       util_view.cpp
+               )
+       endif()
+       list(APPEND INC_SYS ../../third_party/numaapi/include)
+else()
+       list(APPEND INC_SYS ../../numaapi/include)
 endif()
 
 set(SRC_HEADERS
index 34f428f111c050df436a4e163a0e503f130110a6..cc2d7017fd8548ecda21834d92e60159e9d5a839 100644 (file)
@@ -20,6 +20,8 @@
 #include "util/util_types.h"
 #include "util/util_string.h"
 
+#include <numaapi.h>
+
 #ifdef _WIN32
 #  if(!defined(FREE_WINDOWS))
 #    include <intrin.h>
 
 CCL_NAMESPACE_BEGIN
 
-int system_cpu_group_count()
+bool system_cpu_ensure_initialized()
 {
-#ifdef _WIN32
-       util_windows_init_numa_groups();
-       return GetActiveProcessorGroupCount();
-#else
-       /* TODO(sergey): Need to adopt for other platforms. */
-       return 1;
-#endif
+       static bool is_initialized = false;
+       static bool result = false;
+       if (is_initialized) {
+               return result;
+       }
+       is_initialized = true;
+       const NUMAAPI_Result numa_result = numaAPI_Initialize();
+       result = (numa_result == NUMAAPI_SUCCESS);
+       return result;
 }
 
-int system_cpu_group_thread_count(int group)
+/* Fallback solution, which doesn't use NUMA/CPU groups. */
+static int system_cpu_thread_count_fallback()
 {
-       /* TODO(sergey): Need make other platforms aware of groups. */
 #ifdef _WIN32
-       util_windows_init_numa_groups();
-       return GetActiveProcessorCount(group);
+       SYSTEM_INFO info;
+       GetSystemInfo(&info);
+       return info.dwNumberOfProcessors;
 #elif defined(__APPLE__)
-       (void) group;
        int count;
        size_t len = sizeof(count);
        int mib[2] = { CTL_HW, HW_NCPU };
        sysctl(mib, 2, &count, &len, NULL, 0);
        return count;
 #else
-       (void) group;
        return sysconf(_SC_NPROCESSORS_ONLN);
 #endif
 }
 
 int system_cpu_thread_count()
 {
-       static uint count = 0;
-
-       if(count > 0) {
-               return count;
+       const int num_nodes = system_cpu_num_numa_nodes();
+       int num_threads = 0;
+       for (int node = 0; node < num_nodes; ++node) {
+               if (!system_cpu_is_numa_node_available(node)) {
+                       continue;
+               }
+               num_threads += system_cpu_num_numa_node_processors(node);
        }
+       return num_threads;
+}
 
-       int max_group = system_cpu_group_count();
-       VLOG(1) << "Detected " << max_group << " CPU groups.";
-       for(int group = 0; group < max_group; ++group) {
-               int num_threads = system_cpu_group_thread_count(group);
-               VLOG(1) << "Group " << group
-                       << " has " << num_threads << " threads.";
-               count += num_threads;
+int system_cpu_num_numa_nodes()
+{
+       if (!system_cpu_ensure_initialized()) {
+               /* Fallback to a single node with all the threads. */
+               return 1;
        }
+       return numaAPI_GetNumNodes();
+}
 
-       if(count < 1) {
-               count = 1;
+bool system_cpu_is_numa_node_available(int node)
+{
+       if (!system_cpu_ensure_initialized()) {
+               return true;
        }
+       return numaAPI_IsNodeAvailable(node);
+}
 
-       return count;
+int system_cpu_num_numa_node_processors(int node)
+{
+       if (!system_cpu_ensure_initialized()) {
+               return system_cpu_thread_count_fallback();
+       }
+       return numaAPI_GetNumNodeProcessors(node);
 }
 
-unsigned short system_cpu_process_groups(unsigned short max_groups,
-                                         unsigned short *groups)
+bool system_cpu_run_thread_on_node(int node)
 {
-#ifdef _WIN32
-       unsigned short group_count = max_groups;
-       if(!GetProcessGroupAffinity(GetCurrentProcess(), &group_count, groups)) {
-               return 0;
+       if (!system_cpu_ensure_initialized()) {
+               return true;
        }
-       return group_count;
-#else
-       (void) max_groups;
-       (void) groups;
-       return 0;
-#endif
+       return numaAPI_RunThreadOnNode(node);
 }
 
 #if !defined(_WIN32) || defined(FREE_WINDOWS)
index 241ac89715748499ff5f2b5e124140efba5e1dc0..15f69bcf15354a4f485a1cdf45fafefb7919b585 100644 (file)
 
 CCL_NAMESPACE_BEGIN
 
-/* Get number of available CPU groups. */
-int system_cpu_group_count();
+/* Make sure CPU groups / NUMA API is initialized. */
+bool system_cpu_ensure_initialized();
 
-/* Get number of threads/processors in the specified group. */
-int system_cpu_group_thread_count(int group);
-
-/* Get total number of threads in all groups. */
+/* Get total number of threads in all NUMA nodes / CPU groups. */
 int system_cpu_thread_count();
 
-/* Get current process groups. */
-unsigned short system_cpu_process_groups(unsigned short max_groups,
-                                         unsigned short *grpups);
+/* Get number of available nodes.
+ *
+ * This is in fact an index of last node plus one and it's not guaranteed
+ * that all nodes up to this one are available. */
+int system_cpu_num_numa_nodes();
+
+/* Returns truth if the given node is available for compute. */
+bool system_cpu_is_numa_node_available(int node);
+
+/* Get number of available processors on a given node. */
+int system_cpu_num_numa_node_processors(int node);
+
+/* Runs the current thread and its children on a specific node.
+ *
+ * Returns truth if affinity has successfully changed. */
+bool system_cpu_run_thread_on_node(int node);
 
 string system_cpu_brand_string();
 int system_cpu_bits();
index 2d21d6b5a18fe85031c2335b01fb88526db504c4..50a2bb160ff6a62446f91eb4829e330ba3031603 100644 (file)
@@ -204,50 +204,26 @@ void TaskScheduler::init(int num_threads)
                /* launch threads that will be waiting for work */
                threads.resize(num_threads);
 
-               const int num_groups = system_cpu_group_count();
-               unsigned short num_process_groups = 0;
-               vector<unsigned short> process_groups;
-               int current_group_threads = 0;
-               if(num_groups > 1) {
-                       process_groups.resize(num_groups);
-                       num_process_groups = system_cpu_process_groups(num_groups,
-                                                                      &process_groups[0]);
-                       if(num_process_groups == 1) {
-                               current_group_threads = system_cpu_group_thread_count(process_groups[0]);
-                       }
-               }
+               const int num_nodes = system_cpu_num_numa_nodes();
                int thread_index = 0;
-               for(int group = 0; group < num_groups; ++group) {
-                       /* NOTE: That's not really efficient from threading point of view,
-                        * but it is simple to read and it doesn't make sense to use more
-                        * user-specified threads than logical threads anyway.
-                        */
-                       int num_group_threads = (group == num_groups - 1)
-                               ? (threads.size() - thread_index)
-                               : system_cpu_group_thread_count(group);
-                       for(int group_thread = 0;
-                               group_thread < num_group_threads && thread_index < threads.size();
-                               ++group_thread, ++thread_index)
+               for (int node = 0;
+                    node < num_nodes && thread_index < threads.size();
+                    ++node)
+               {
+                       if (!system_cpu_is_numa_node_available(node)) {
+                               continue;
+                       }
+                       const int num_node_processors =
+                               system_cpu_num_numa_node_processors(node);
+                       for (int i = 0;
+                            i < num_node_processors && thread_index < threads.size();
+                            ++i)
                        {
-                               /* NOTE: Thread group of -1 means we would not force thread affinity. */
-                               int thread_group;
-                               if(num_groups == 1) {
-                                       /* Use default affinity if there's only one CPU group in the system. */
-                                       thread_group = -1;
-                               }
-                               else if(use_auto_threads &&
-                                       num_process_groups == 1 &&
-                                               num_threads <= current_group_threads)
-                               {
-                                       /* If we fit into curent CPU group we also don't force any affinity. */
-                                       thread_group = -1;
-                               }
-                               else {
-                                       thread_group = group;
-                               }
-                               threads[thread_index] = new thread(function_bind(&TaskScheduler::thread_run,
-                                                                                thread_index + 1),
-                                                                  thread_group);
+                               threads[thread_index] = new thread(
+                                       function_bind(&TaskScheduler::thread_run,
+                                                     thread_index + 1),
+                                       node);
+                               thread_index++;
                        }
                }
        }
index 37d8bdbd4b090f0026a0aed6ac329d6887fb201e..4d30e3f564fe37f3180bc57418ffef12d7dd49d3 100644 (file)
 
 CCL_NAMESPACE_BEGIN
 
-thread::thread(function<void()> run_cb, int group)
+thread::thread(function<void()> run_cb, int node)
   : run_cb_(run_cb),
     joined_(false),
-       group_(group)
+       node_(node)
 {
        thread_ = std::thread(&thread::run, this);
 }
@@ -39,19 +39,8 @@ thread::~thread()
 void *thread::run(void *arg)
 {
        thread *self = (thread*)(arg);
-       if(self->group_ != -1) {
-#ifdef _WIN32
-               HANDLE thread_handle = GetCurrentThread();
-               GROUP_AFFINITY group_affinity = { 0 };
-               int num_threads = system_cpu_group_thread_count(self->group_);
-               group_affinity.Group = self->group_;
-               group_affinity.Mask = (num_threads == 64)
-                                             ? -1
-                                             :  (1ull << num_threads) - 1;
-               if(SetThreadGroupAffinity(thread_handle, &group_affinity, NULL) == 0) {
-                       fprintf(stderr, "Error setting thread affinity.\n");
-               }
-#endif
+       if (self->node_ != -1) {
+               system_cpu_run_thread_on_node(self->node_);
        }
        self->run_cb_();
        return NULL;
index 6250bb95dcfdc3f5ea5320d06f5c314a2f1eef22..d54199a37fc37654cad584e18f65e387fcf112e8 100644 (file)
@@ -46,7 +46,7 @@ typedef std::condition_variable thread_condition_variable;
 
 class thread {
 public:
-       thread(function<void()> run_cb, int group = -1);
+       thread(function<void()> run_cb, int node = -1);
        ~thread();
 
        static void *run(void *arg);
@@ -56,7 +56,7 @@ protected:
        function<void()> run_cb_;
        std::thread thread_;
        bool joined_;
-       int group_;
+       int node_;
 };
 
 /* Own wrapper around pthread's spin lock to make it's use easier. */
diff --git a/intern/cycles/util/util_windows.cpp b/intern/cycles/util/util_windows.cpp
deleted file mode 100644 (file)
index 073db2a..0000000
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "util/util_windows.h"
-
-#ifdef _WIN32
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef _M_X64
-#  include <VersionHelpers.h>
-#endif
-
-#if _WIN32_WINNT < 0x0601
-tGetActiveProcessorGroupCount *GetActiveProcessorGroupCount;
-tGetActiveProcessorCount *GetActiveProcessorCount;
-tSetThreadGroupAffinity *SetThreadGroupAffinity;
-tGetProcessGroupAffinity *GetProcessGroupAffinity;
-#endif
-
-static WORD GetActiveProcessorGroupCount_stub()
-{
-       return 1;
-}
-
-static DWORD GetActiveProcessorCount_stub(WORD /*GroupNumber*/)
-{
-       SYSTEM_INFO info;
-       GetSystemInfo(&info);
-       return info.dwNumberOfProcessors;
-}
-
-static BOOL SetThreadGroupAffinity_stub(
-        HANDLE /*hThread*/,
-        const GROUP_AFFINITY  * /*GroupAffinity*/,
-        PGROUP_AFFINITY /*PreviousGroupAffinity*/)
-{
-       return TRUE;
-}
-
-static BOOL GetProcessGroupAffinity_stub(HANDLE hProcess,
-                                         PUSHORT GroupCount,
-                                         PUSHORT GroupArray)
-{
-       if(*GroupCount < 1) {
-               return FALSE;
-       }
-       *GroupCount = 1;
-       GroupArray[0] = 0;
-       return TRUE;
-}
-
-static bool supports_numa()
-{
-#ifndef _M_X64
-       return false;
-#else
-       return IsWindows7OrGreater();
-#endif
-}
-
-void util_windows_init_numa_groups()
-{
-       static bool initialized = false;
-       if(initialized) {
-               return;
-       }
-       initialized = true;
-#if _WIN32_WINNT < 0x0601
-       if(!supports_numa()) {
-               /* Use stubs on platforms which doesn't have rean NUMA/Groups. */
-               GetActiveProcessorGroupCount = GetActiveProcessorGroupCount_stub;
-               GetActiveProcessorCount = GetActiveProcessorCount_stub;
-               SetThreadGroupAffinity = SetThreadGroupAffinity_stub;
-               GetProcessGroupAffinity = GetProcessGroupAffinity_stub;
-               return;
-       }
-       HMODULE kernel = GetModuleHandleA("kernel32.dll");
-#  define READ_SYMBOL(sym) sym = (t##sym*)GetProcAddress(kernel, #sym)
-       READ_SYMBOL(GetActiveProcessorGroupCount);
-       READ_SYMBOL(GetActiveProcessorCount);
-       READ_SYMBOL(SetThreadGroupAffinity);
-       READ_SYMBOL(GetProcessGroupAffinity);
-#  undef READ_SUMBOL
-#endif
-}
-
-CCL_NAMESPACE_END
-
-#endif  /* _WIN32 */
index 9b9268fed7a9c2f641cd86c9f94576e08b7d041a..bd1bc85adff1a3be87cc5162852ae46f4cfc7412 100644 (file)
 
 #include <windows.h>
 
-CCL_NAMESPACE_BEGIN
-
-#if _WIN32_WINNT < 0x0601
-typedef WORD tGetActiveProcessorGroupCount();
-typedef DWORD tGetActiveProcessorCount(WORD GroupNumber);
-typedef BOOL tSetThreadGroupAffinity(HANDLE hThread,
-                                     const GROUP_AFFINITY  *GroupAffinity,
-                                     PGROUP_AFFINITY PreviousGroupAffinity);
-typedef BOOL tGetProcessGroupAffinity(HANDLE  hProcess,
-                                     PUSHORT GroupCount,
-                                     PUSHORT GroupArray);
-
-extern tGetActiveProcessorGroupCount *GetActiveProcessorGroupCount;
-extern tGetActiveProcessorCount *GetActiveProcessorCount;
-extern tSetThreadGroupAffinity *SetThreadGroupAffinity;
-extern tGetProcessGroupAffinity *GetProcessGroupAffinity;
-#endif
-
-/* Make sure NUMA and processor groups API is initialized. */
-void util_windows_init_numa_groups();
-
-CCL_NAMESPACE_END
-
-#endif  /* WIN32 */
+#endif  /* _WIN32 */
 
 #endif  /* __UTIL_WINDOWS_H__ */
index a4f32d8845892ca28365906d1cc92579b0ae8358..7b5b50fdf39ba3e9df12cf2d1d134b29ef66e31e 100644 (file)
@@ -67,7 +67,7 @@ int numaAPI_GetNumNodes(void);
 // Returns truth if the given node is available for compute.
 bool numaAPI_IsNodeAvailable(int node);
 
-// Getnumber of available processors on a given node.
+// Get number of available processors on a given node.
 int numaAPI_GetNumNodeProcessors(int node);
 
 ////////////////////////////////////////////////////////////////////////////////
index 559e97b67d3464ebe51e6aa0750a073a72899ebc..62e9dcdfadfee3625279156ef86b5de0306a7886 100644 (file)
@@ -34,6 +34,8 @@
 #  include <dlfcn.h>
 #endif
 
+#include <stdio.h>
+
 #ifdef WITH_DYNLOAD
 
 // Descriptor numa library.
@@ -61,6 +63,7 @@ typedef struct bitmask* tnuma_allocate_nodemask(void);
 typedef void tnuma_free_cpumask(struct bitmask* bitmask);
 typedef void tnuma_free_nodemask(struct bitmask* bitmask);
 typedef int tnuma_run_on_node_mask(struct bitmask *nodemask);
+typedef int tnuma_run_on_node_mask_all(struct bitmask *nodemask);
 typedef void tnuma_set_interleave_mask(struct bitmask *nodemask);
 typedef void tnuma_set_localalloc(void);
 
@@ -83,6 +86,7 @@ static tnuma_allocate_nodemask* numa_allocate_nodemask;
 static tnuma_free_nodemask* numa_free_nodemask;
 static tnuma_free_cpumask* numa_free_cpumask;
 static tnuma_run_on_node_mask* numa_run_on_node_mask;
+static tnuma_run_on_node_mask_all* numa_run_on_node_mask_all;
 static tnuma_set_interleave_mask* numa_set_interleave_mask;
 static tnuma_set_localalloc* numa_set_localalloc;
 
@@ -157,6 +161,7 @@ static NUMAAPI_Result loadNumaSymbols(void) {
   NUMA_LIBRARY_FIND(numa_free_cpumask);
   NUMA_LIBRARY_FIND(numa_free_nodemask);
   NUMA_LIBRARY_FIND(numa_run_on_node_mask);
+  NUMA_LIBRARY_FIND(numa_run_on_node_mask_all);
   NUMA_LIBRARY_FIND(numa_set_interleave_mask);
   NUMA_LIBRARY_FIND(numa_set_localalloc);
 
@@ -192,10 +197,7 @@ int numaAPI_GetNumNodes(void) {
 }
 
 bool numaAPI_IsNodeAvailable(int node) {
-  if (numa_node_size(node, NULL) > 0) {
-    return true;
-  }
-  return false;
+  return numaAPI_GetNumNodeProcessors(node) > 0;
 }
 
 int numaAPI_GetNumNodeProcessors(int node) {
@@ -235,13 +237,15 @@ bool numaAPI_RunThreadOnNode(int node) {
   struct bitmask* node_mask = numa_allocate_nodemask();
   numa_bitmask_clearall(node_mask);
   numa_bitmask_setbit(node_mask, node);
-  numa_run_on_node_mask(node_mask);
+  numa_run_on_node_mask_all(node_mask);
   // TODO(sergey): The following commands are based on x265 code, we might want
   // to make those optional, or require to call those explicitly.
   //
   // Current assumption is that this is similar to SetThreadGroupAffinity().
-  numa_set_interleave_mask(node_mask);
-  numa_set_localalloc();
+  if (numa_node_size(node, NULL) > 0) {
+    numa_set_interleave_mask(node_mask);
+    numa_set_localalloc();
+  }
 #ifdef WITH_DYNLOAD
   if (numa_free_nodemask != NULL) {
     numa_free_nodemask(node_mask);