Fix T59874: Cycles CPU 25% load only during rendering
authorSergey Sharybin <sergey.vfx@gmail.com>
Thu, 27 Dec 2018 18:01:19 +0000 (19:01 +0100)
committerSergey Sharybin <sergey.vfx@gmail.com>
Thu, 27 Dec 2018 18:12:59 +0000 (19:12 +0100)
The issue was introduced by a Threadripper2 commit back in
ce927e15e0e3. This boils down to threads inheriting affinity
from the parent thread. It is a question how this slipped
through the review (we definitely run benchmark round).

Quick fix could have been to always set CPU group affinity
in Cycles, and it would work for Windows. On other platforms
we did not have CPU groups API finished.

Ended up making Cycles aware of NUMA topology, so now we
bound threads to a specific NUMA node. This required adding
an external dependency to Cycles, but made some code there
shorter.

intern/cycles/util/CMakeLists.txt
intern/cycles/util/util_system.cpp
intern/cycles/util/util_system.h
intern/cycles/util/util_task.cpp
intern/cycles/util/util_thread.cpp
intern/cycles/util/util_thread.h
intern/cycles/util/util_windows.cpp [deleted file]
intern/cycles/util/util_windows.h
intern/numaapi/include/numaapi.h
intern/numaapi/source/numaapi_linux.c

index 92dfc9f..42626d0 100644 (file)
@@ -25,13 +25,17 @@ set(SRC
        util_thread.cpp
        util_time.cpp
        util_transform.cpp
-       util_windows.cpp
 )
 
-if(WITH_CYCLES_STANDALONE AND WITH_CYCLES_STANDALONE_GUI)
-       list(APPEND SRC
-               util_view.cpp
-       )
+if(WITH_CYCLES_STANDALONE)
+       if (WITH_CYCLES_STANDALONE_GUI)
+               list(APPEND SRC
+                       util_view.cpp
+               )
+       endif()
+       list(APPEND INC_SYS ../../third_party/numaapi/include)
+else()
+       list(APPEND INC_SYS ../../numaapi/include)
 endif()
 
 set(SRC_HEADERS
index 34f428f..cc2d701 100644 (file)
@@ -20,6 +20,8 @@
 #include "util/util_types.h"
 #include "util/util_string.h"
 
+#include <numaapi.h>
+
 #ifdef _WIN32
 #  if(!defined(FREE_WINDOWS))
 #    include <intrin.h>
 
 CCL_NAMESPACE_BEGIN
 
-int system_cpu_group_count()
+bool system_cpu_ensure_initialized()
 {
-#ifdef _WIN32
-       util_windows_init_numa_groups();
-       return GetActiveProcessorGroupCount();
-#else
-       /* TODO(sergey): Need to adopt for other platforms. */
-       return 1;
-#endif
+       static bool is_initialized = false;
+       static bool result = false;
+       if (is_initialized) {
+               return result;
+       }
+       is_initialized = true;
+       const NUMAAPI_Result numa_result = numaAPI_Initialize();
+       result = (numa_result == NUMAAPI_SUCCESS);
+       return result;
 }
 
-int system_cpu_group_thread_count(int group)
+/* Fallback solution, which doesn't use NUMA/CPU groups. */
+static int system_cpu_thread_count_fallback()
 {
-       /* TODO(sergey): Need make other platforms aware of groups. */
 #ifdef _WIN32
-       util_windows_init_numa_groups();
-       return GetActiveProcessorCount(group);
+       SYSTEM_INFO info;
+       GetSystemInfo(&info);
+       return info.dwNumberOfProcessors;
 #elif defined(__APPLE__)
-       (void) group;
        int count;
        size_t len = sizeof(count);
        int mib[2] = { CTL_HW, HW_NCPU };
        sysctl(mib, 2, &count, &len, NULL, 0);
        return count;
 #else
-       (void) group;
        return sysconf(_SC_NPROCESSORS_ONLN);
 #endif
 }
 
 int system_cpu_thread_count()
 {
-       static uint count = 0;
-
-       if(count > 0) {
-               return count;
+       const int num_nodes = system_cpu_num_numa_nodes();
+       int num_threads = 0;
+       for (int node = 0; node < num_nodes; ++node) {
+               if (!system_cpu_is_numa_node_available(node)) {
+                       continue;
+               }
+               num_threads += system_cpu_num_numa_node_processors(node);
        }
+       return num_threads;
+}
 
-       int max_group = system_cpu_group_count();
-       VLOG(1) << "Detected " << max_group << " CPU groups.";
-       for(int group = 0; group < max_group; ++group) {
-               int num_threads = system_cpu_group_thread_count(group);
-               VLOG(1) << "Group " << group
-                       << " has " << num_threads << " threads.";
-               count += num_threads;
+int system_cpu_num_numa_nodes()
+{
+       if (!system_cpu_ensure_initialized()) {
+               /* Fallback to a single node with all the threads. */
+               return 1;
        }
+       return numaAPI_GetNumNodes();
+}
 
-       if(count < 1) {
-               count = 1;
+bool system_cpu_is_numa_node_available(int node)
+{
+       if (!system_cpu_ensure_initialized()) {
+               return true;
        }
+       return numaAPI_IsNodeAvailable(node);
+}
 
-       return count;
+int system_cpu_num_numa_node_processors(int node)
+{
+       if (!system_cpu_ensure_initialized()) {
+               return system_cpu_thread_count_fallback();
+       }
+       return numaAPI_GetNumNodeProcessors(node);
 }
 
-unsigned short system_cpu_process_groups(unsigned short max_groups,
-                                         unsigned short *groups)
+bool system_cpu_run_thread_on_node(int node)
 {
-#ifdef _WIN32
-       unsigned short group_count = max_groups;
-       if(!GetProcessGroupAffinity(GetCurrentProcess(), &group_count, groups)) {
-               return 0;
+       if (!system_cpu_ensure_initialized()) {
+               return true;
        }
-       return group_count;
-#else
-       (void) max_groups;
-       (void) groups;
-       return 0;
-#endif
+       return numaAPI_RunThreadOnNode(node);
 }
 
 #if !defined(_WIN32) || defined(FREE_WINDOWS)
index 241ac89..15f69bc 100644 (file)
 
 CCL_NAMESPACE_BEGIN
 
-/* Get number of available CPU groups. */
-int system_cpu_group_count();
+/* Make sure CPU groups / NUMA API is initialized. */
+bool system_cpu_ensure_initialized();
 
-/* Get number of threads/processors in the specified group. */
-int system_cpu_group_thread_count(int group);
-
-/* Get total number of threads in all groups. */
+/* Get total number of threads in all NUMA nodes / CPU groups. */
 int system_cpu_thread_count();
 
-/* Get current process groups. */
-unsigned short system_cpu_process_groups(unsigned short max_groups,
-                                         unsigned short *grpups);
+/* Get number of available nodes.
+ *
+ * This is in fact an index of last node plus one and it's not guaranteed
+ * that all nodes up to this one are available. */
+int system_cpu_num_numa_nodes();
+
+/* Returns truth if the given node is available for compute. */
+bool system_cpu_is_numa_node_available(int node);
+
+/* Get number of available processors on a given node. */
+int system_cpu_num_numa_node_processors(int node);
+
+/* Runs the current thread and its children on a specific node.
+ *
+ * Returns truth if affinity has successfully changed. */
+bool system_cpu_run_thread_on_node(int node);
 
 string system_cpu_brand_string();
 int system_cpu_bits();
index 2d21d6b..50a2bb1 100644 (file)
@@ -204,50 +204,26 @@ void TaskScheduler::init(int num_threads)
                /* launch threads that will be waiting for work */
                threads.resize(num_threads);
 
-               const int num_groups = system_cpu_group_count();
-               unsigned short num_process_groups = 0;
-               vector<unsigned short> process_groups;
-               int current_group_threads = 0;
-               if(num_groups > 1) {
-                       process_groups.resize(num_groups);
-                       num_process_groups = system_cpu_process_groups(num_groups,
-                                                                      &process_groups[0]);
-                       if(num_process_groups == 1) {
-                               current_group_threads = system_cpu_group_thread_count(process_groups[0]);
-                       }
-               }
+               const int num_nodes = system_cpu_num_numa_nodes();
                int thread_index = 0;
-               for(int group = 0; group < num_groups; ++group) {
-                       /* NOTE: That's not really efficient from threading point of view,
-                        * but it is simple to read and it doesn't make sense to use more
-                        * user-specified threads than logical threads anyway.
-                        */
-                       int num_group_threads = (group == num_groups - 1)
-                               ? (threads.size() - thread_index)
-                               : system_cpu_group_thread_count(group);
-                       for(int group_thread = 0;
-                               group_thread < num_group_threads && thread_index < threads.size();
-                               ++group_thread, ++thread_index)
+               for (int node = 0;
+                    node < num_nodes && thread_index < threads.size();
+                    ++node)
+               {
+                       if (!system_cpu_is_numa_node_available(node)) {
+                               continue;
+                       }
+                       const int num_node_processors =
+                               system_cpu_num_numa_node_processors(node);
+                       for (int i = 0;
+                            i < num_node_processors && thread_index < threads.size();
+                            ++i)
                        {
-                               /* NOTE: Thread group of -1 means we would not force thread affinity. */
-                               int thread_group;
-                               if(num_groups == 1) {
-                                       /* Use default affinity if there's only one CPU group in the system. */
-                                       thread_group = -1;
-                               }
-                               else if(use_auto_threads &&
-                                       num_process_groups == 1 &&
-                                               num_threads <= current_group_threads)
-                               {
-                                       /* If we fit into curent CPU group we also don't force any affinity. */
-                                       thread_group = -1;
-                               }
-                               else {
-                                       thread_group = group;
-                               }
-                               threads[thread_index] = new thread(function_bind(&TaskScheduler::thread_run,
-                                                                                thread_index + 1),
-                                                                  thread_group);
+                               threads[thread_index] = new thread(
+                                       function_bind(&TaskScheduler::thread_run,
+                                                     thread_index + 1),
+                                       node);
+                               thread_index++;
                        }
                }
        }
index 37d8bdb..4d30e3f 100644 (file)
 
 CCL_NAMESPACE_BEGIN
 
-thread::thread(function<void()> run_cb, int group)
+thread::thread(function<void()> run_cb, int node)
   : run_cb_(run_cb),
     joined_(false),
-       group_(group)
+       node_(node)
 {
        thread_ = std::thread(&thread::run, this);
 }
@@ -39,19 +39,8 @@ thread::~thread()
 void *thread::run(void *arg)
 {
        thread *self = (thread*)(arg);
-       if(self->group_ != -1) {
-#ifdef _WIN32
-               HANDLE thread_handle = GetCurrentThread();
-               GROUP_AFFINITY group_affinity = { 0 };
-               int num_threads = system_cpu_group_thread_count(self->group_);
-               group_affinity.Group = self->group_;
-               group_affinity.Mask = (num_threads == 64)
-                                             ? -1
-                                             :  (1ull << num_threads) - 1;
-               if(SetThreadGroupAffinity(thread_handle, &group_affinity, NULL) == 0) {
-                       fprintf(stderr, "Error setting thread affinity.\n");
-               }
-#endif
+       if (self->node_ != -1) {
+               system_cpu_run_thread_on_node(self->node_);
        }
        self->run_cb_();
        return NULL;
index 6250bb9..d54199a 100644 (file)
@@ -46,7 +46,7 @@ typedef std::condition_variable thread_condition_variable;
 
 class thread {
 public:
-       thread(function<void()> run_cb, int group = -1);
+       thread(function<void()> run_cb, int node = -1);
        ~thread();
 
        static void *run(void *arg);
@@ -56,7 +56,7 @@ protected:
        function<void()> run_cb_;
        std::thread thread_;
        bool joined_;
-       int group_;
+       int node_;
 };
 
 /* Own wrapper around pthread's spin lock to make it's use easier. */
diff --git a/intern/cycles/util/util_windows.cpp b/intern/cycles/util/util_windows.cpp
deleted file mode 100644 (file)
index 073db2a..0000000
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "util/util_windows.h"
-
-#ifdef _WIN32
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef _M_X64
-#  include <VersionHelpers.h>
-#endif
-
-#if _WIN32_WINNT < 0x0601
-tGetActiveProcessorGroupCount *GetActiveProcessorGroupCount;
-tGetActiveProcessorCount *GetActiveProcessorCount;
-tSetThreadGroupAffinity *SetThreadGroupAffinity;
-tGetProcessGroupAffinity *GetProcessGroupAffinity;
-#endif
-
-static WORD GetActiveProcessorGroupCount_stub()
-{
-       return 1;
-}
-
-static DWORD GetActiveProcessorCount_stub(WORD /*GroupNumber*/)
-{
-       SYSTEM_INFO info;
-       GetSystemInfo(&info);
-       return info.dwNumberOfProcessors;
-}
-
-static BOOL SetThreadGroupAffinity_stub(
-        HANDLE /*hThread*/,
-        const GROUP_AFFINITY  * /*GroupAffinity*/,
-        PGROUP_AFFINITY /*PreviousGroupAffinity*/)
-{
-       return TRUE;
-}
-
-static BOOL GetProcessGroupAffinity_stub(HANDLE hProcess,
-                                         PUSHORT GroupCount,
-                                         PUSHORT GroupArray)
-{
-       if(*GroupCount < 1) {
-               return FALSE;
-       }
-       *GroupCount = 1;
-       GroupArray[0] = 0;
-       return TRUE;
-}
-
-static bool supports_numa()
-{
-#ifndef _M_X64
-       return false;
-#else
-       return IsWindows7OrGreater();
-#endif
-}
-
-void util_windows_init_numa_groups()
-{
-       static bool initialized = false;
-       if(initialized) {
-               return;
-       }
-       initialized = true;
-#if _WIN32_WINNT < 0x0601
-       if(!supports_numa()) {
-               /* Use stubs on platforms which doesn't have rean NUMA/Groups. */
-               GetActiveProcessorGroupCount = GetActiveProcessorGroupCount_stub;
-               GetActiveProcessorCount = GetActiveProcessorCount_stub;
-               SetThreadGroupAffinity = SetThreadGroupAffinity_stub;
-               GetProcessGroupAffinity = GetProcessGroupAffinity_stub;
-               return;
-       }
-       HMODULE kernel = GetModuleHandleA("kernel32.dll");
-#  define READ_SYMBOL(sym) sym = (t##sym*)GetProcAddress(kernel, #sym)
-       READ_SYMBOL(GetActiveProcessorGroupCount);
-       READ_SYMBOL(GetActiveProcessorCount);
-       READ_SYMBOL(SetThreadGroupAffinity);
-       READ_SYMBOL(GetProcessGroupAffinity);
-#  undef READ_SUMBOL
-#endif
-}
-
-CCL_NAMESPACE_END
-
-#endif  /* _WIN32 */
index 9b9268f..bd1bc85 100644 (file)
 
 #include <windows.h>
 
-CCL_NAMESPACE_BEGIN
-
-#if _WIN32_WINNT < 0x0601
-typedef WORD tGetActiveProcessorGroupCount();
-typedef DWORD tGetActiveProcessorCount(WORD GroupNumber);
-typedef BOOL tSetThreadGroupAffinity(HANDLE hThread,
-                                     const GROUP_AFFINITY  *GroupAffinity,
-                                     PGROUP_AFFINITY PreviousGroupAffinity);
-typedef BOOL tGetProcessGroupAffinity(HANDLE  hProcess,
-                                     PUSHORT GroupCount,
-                                     PUSHORT GroupArray);
-
-extern tGetActiveProcessorGroupCount *GetActiveProcessorGroupCount;
-extern tGetActiveProcessorCount *GetActiveProcessorCount;
-extern tSetThreadGroupAffinity *SetThreadGroupAffinity;
-extern tGetProcessGroupAffinity *GetProcessGroupAffinity;
-#endif
-
-/* Make sure NUMA and processor groups API is initialized. */
-void util_windows_init_numa_groups();
-
-CCL_NAMESPACE_END
-
-#endif  /* WIN32 */
+#endif  /* _WIN32 */
 
 #endif  /* __UTIL_WINDOWS_H__ */
index a4f32d8..7b5b50f 100644 (file)
@@ -67,7 +67,7 @@ int numaAPI_GetNumNodes(void);
 // Returns truth if the given node is available for compute.
 bool numaAPI_IsNodeAvailable(int node);
 
-// Getnumber of available processors on a given node.
+// Get number of available processors on a given node.
 int numaAPI_GetNumNodeProcessors(int node);
 
 ////////////////////////////////////////////////////////////////////////////////
index 559e97b..62e9dcd 100644 (file)
@@ -34,6 +34,8 @@
 #  include <dlfcn.h>
 #endif
 
+#include <stdio.h>
+
 #ifdef WITH_DYNLOAD
 
 // Descriptor numa library.
@@ -61,6 +63,7 @@ typedef struct bitmask* tnuma_allocate_nodemask(void);
 typedef void tnuma_free_cpumask(struct bitmask* bitmask);
 typedef void tnuma_free_nodemask(struct bitmask* bitmask);
 typedef int tnuma_run_on_node_mask(struct bitmask *nodemask);
+typedef int tnuma_run_on_node_mask_all(struct bitmask *nodemask);
 typedef void tnuma_set_interleave_mask(struct bitmask *nodemask);
 typedef void tnuma_set_localalloc(void);
 
@@ -83,6 +86,7 @@ static tnuma_allocate_nodemask* numa_allocate_nodemask;
 static tnuma_free_nodemask* numa_free_nodemask;
 static tnuma_free_cpumask* numa_free_cpumask;
 static tnuma_run_on_node_mask* numa_run_on_node_mask;
+static tnuma_run_on_node_mask_all* numa_run_on_node_mask_all;
 static tnuma_set_interleave_mask* numa_set_interleave_mask;
 static tnuma_set_localalloc* numa_set_localalloc;
 
@@ -157,6 +161,7 @@ static NUMAAPI_Result loadNumaSymbols(void) {
   NUMA_LIBRARY_FIND(numa_free_cpumask);
   NUMA_LIBRARY_FIND(numa_free_nodemask);
   NUMA_LIBRARY_FIND(numa_run_on_node_mask);
+  NUMA_LIBRARY_FIND(numa_run_on_node_mask_all);
   NUMA_LIBRARY_FIND(numa_set_interleave_mask);
   NUMA_LIBRARY_FIND(numa_set_localalloc);
 
@@ -192,10 +197,7 @@ int numaAPI_GetNumNodes(void) {
 }
 
 bool numaAPI_IsNodeAvailable(int node) {
-  if (numa_node_size(node, NULL) > 0) {
-    return true;
-  }
-  return false;
+  return numaAPI_GetNumNodeProcessors(node) > 0;
 }
 
 int numaAPI_GetNumNodeProcessors(int node) {
@@ -235,13 +237,15 @@ bool numaAPI_RunThreadOnNode(int node) {
   struct bitmask* node_mask = numa_allocate_nodemask();
   numa_bitmask_clearall(node_mask);
   numa_bitmask_setbit(node_mask, node);
-  numa_run_on_node_mask(node_mask);
+  numa_run_on_node_mask_all(node_mask);
   // TODO(sergey): The following commands are based on x265 code, we might want
   // to make those optional, or require to call those explicitly.
   //
   // Current assumption is that this is similar to SetThreadGroupAffinity().
-  numa_set_interleave_mask(node_mask);
-  numa_set_localalloc();
+  if (numa_node_size(node, NULL) > 0) {
+    numa_set_interleave_mask(node_mask);
+    numa_set_localalloc();
+  }
 #ifdef WITH_DYNLOAD
   if (numa_free_nodemask != NULL) {
     numa_free_nodemask(node_mask);