Merge branch 'blender2.7'
author Sergey Sharybin <sergey.vfx@gmail.com>
Fri, 11 Jan 2019 17:09:05 +0000 (18:09 +0100)
committer Sergey Sharybin <sergey.vfx@gmail.com>
Fri, 11 Jan 2019 17:09:05 +0000 (18:09 +0100)
intern/cycles/util/util_system.cpp
intern/cycles/util/util_system.h
intern/cycles/util/util_task.cpp
intern/numaapi/README.blender
intern/numaapi/include/numaapi.h
intern/numaapi/source/numaapi_linux.c
intern/numaapi/source/numaapi_stub.c
intern/numaapi/source/numaapi_win32.c
release/scripts/addons

index cc2d7017fd8548ecda21834d92e60159e9d5a839..a22bd25ce77d79c99ae155c4c0500fec1b51799f 100644 (file)
@@ -40,7 +40,7 @@ bool system_cpu_ensure_initialized()
 {
        static bool is_initialized = false;
        static bool result = false;
-       if (is_initialized) {
+       if(is_initialized) {
                return result;
        }
        is_initialized = true;
@@ -71,8 +71,8 @@ int system_cpu_thread_count()
 {
        const int num_nodes = system_cpu_num_numa_nodes();
        int num_threads = 0;
-       for (int node = 0; node < num_nodes; ++node) {
-               if (!system_cpu_is_numa_node_available(node)) {
+       for(int node = 0; node < num_nodes; ++node) {
+               if(!system_cpu_is_numa_node_available(node)) {
                        continue;
                }
                num_threads += system_cpu_num_numa_node_processors(node);
@@ -82,7 +82,7 @@ int system_cpu_thread_count()
 
 int system_cpu_num_numa_nodes()
 {
-       if (!system_cpu_ensure_initialized()) {
+       if(!system_cpu_ensure_initialized()) {
                /* Fallback to a single node with all the threads. */
                return 1;
        }
@@ -91,7 +91,7 @@ int system_cpu_num_numa_nodes()
 
 bool system_cpu_is_numa_node_available(int node)
 {
-       if (!system_cpu_ensure_initialized()) {
+       if(!system_cpu_ensure_initialized()) {
                return true;
        }
        return numaAPI_IsNodeAvailable(node);
@@ -99,7 +99,7 @@ bool system_cpu_is_numa_node_available(int node)
 
 int system_cpu_num_numa_node_processors(int node)
 {
-       if (!system_cpu_ensure_initialized()) {
+       if(!system_cpu_ensure_initialized()) {
                return system_cpu_thread_count_fallback();
        }
        return numaAPI_GetNumNodeProcessors(node);
@@ -107,12 +107,20 @@ int system_cpu_num_numa_node_processors(int node)
 
 bool system_cpu_run_thread_on_node(int node)
 {
-       if (!system_cpu_ensure_initialized()) {
+       if(!system_cpu_ensure_initialized()) {
                return true;
        }
        return numaAPI_RunThreadOnNode(node);
 }
 
+int system_cpu_num_active_group_processors()
+{
+       if(!system_cpu_ensure_initialized()) {
+               return system_cpu_thread_count_fallback();
+       }
+       return numaAPI_GetNumCurrentNodesProcessors();
+}
+
 #if !defined(_WIN32) || defined(FREE_WINDOWS)
 static void __cpuid(int data[4], int selector)
 {
index 15f69bcf15354a4f485a1cdf45fafefb7919b585..0c001f11f0e97f607af11e36a10f12a04a102291 100644 (file)
@@ -44,6 +44,10 @@ int system_cpu_num_numa_node_processors(int node);
  * Returns truth if affinity has successfully changed. */
 bool system_cpu_run_thread_on_node(int node);
 
+/* Number of processors within the current CPU group (or within the active
+ * thread affinity). */
+int system_cpu_num_active_group_processors();
+
 string system_cpu_brand_string();
 int system_cpu_bits();
 bool system_cpu_support_sse2();
index 7e9f7313fba86c31f235a0323e74e41c1a6afef6..4241c4aa8ccdda83f26c8d0dde983ea1bcda2e20 100644 (file)
@@ -228,9 +228,21 @@ int get_num_total_processors(const vector<int>& num_per_node_processors)
 void distribute_threads_on_nodes(const vector<thread*>& threads)
 {
        const int num_threads = threads.size();
-       /* TODO(sergey): Skip overriding affinity if threads fits into the current
-        * nodes/CPU group. This will allow user to tweak affinity for weird and
-        * wonderful reasons. */
+       const int num_active_group_processors =
+               system_cpu_num_active_group_processors();
+       VLOG(1) << "Detected " << num_active_group_processors << " processors "
+               << "in active group.";
+       if(num_active_group_processors >= num_threads) {
+               /* If the current thread is set up in a way that its affinity allows to
+                * use at least requested number of threads we do not explicitly set
+                * affinity to the worker threads.
+                * This way we allow users to manually edit affinity of the parent
+                * thread, and here we follow that affinity. This way it's possible to
+                * have two Cycles/Blender instances running manually set to a different
+                * dies on a CPU. */
+               VLOG(1) << "Not setting thread group affinity.";
+               return;
+       }
        vector<int> num_per_node_processors;
        get_per_node_num_processors(&num_per_node_processors);
        if(num_per_node_processors.size() == 0) {
index 661073712b90b46b08a69e3d39a361f5f1e54c32..6f71d5f88079d9c8af2f7eae97f9b9993bb1a464 100644 (file)
@@ -1,5 +1,5 @@
 Project: LibNumaAPI
 URL: https://github.com/Nazg-Gul/libNumaAPI
 License: MIT
-Upstream version: f83d41ec4d7
+Upstream version: 4e7206befce
 Local modifications: None
index 7b5b50fdf39ba3e9df12cf2d1d134b29ef66e31e..bddb51448f862ad9ad37aab1af82459ab07d20ef 100644 (file)
@@ -70,6 +70,16 @@ bool numaAPI_IsNodeAvailable(int node);
 // Get number of available processors on a given node.
 int numaAPI_GetNumNodeProcessors(int node);
 
+////////////////////////////////////////////////////////////////////////////////
+// Topology helpers.
+//
+// Those are a bit higher level queries, but are still rather platform-specific
+// and generally useful.
+
+// Get number of processors within the NUMA nodes to which the affinity of the
+// current thread is set.
+int numaAPI_GetNumCurrentNodesProcessors(void);
+
 ////////////////////////////////////////////////////////////////////////////////
 // Affinities.
 
index 62e9dcdfadfee3625279156ef86b5de0306a7886..9750f1c17df9455fb88b943c0d18c8aa78fa136c 100644 (file)
@@ -34,8 +34,6 @@
 #  include <dlfcn.h>
 #endif
 
-#include <stdio.h>
-
 #ifdef WITH_DYNLOAD
 
 // Descriptor numa library.
@@ -64,6 +62,7 @@ typedef void tnuma_free_cpumask(struct bitmask* bitmask);
 typedef void tnuma_free_nodemask(struct bitmask* bitmask);
 typedef int tnuma_run_on_node_mask(struct bitmask *nodemask);
 typedef int tnuma_run_on_node_mask_all(struct bitmask *nodemask);
+typedef struct bitmask *tnuma_get_run_node_mask(void);
 typedef void tnuma_set_interleave_mask(struct bitmask *nodemask);
 typedef void tnuma_set_localalloc(void);
 
@@ -87,6 +86,7 @@ static tnuma_free_nodemask* numa_free_nodemask;
 static tnuma_free_cpumask* numa_free_cpumask;
 static tnuma_run_on_node_mask* numa_run_on_node_mask;
 static tnuma_run_on_node_mask_all* numa_run_on_node_mask_all;
+static tnuma_get_run_node_mask* numa_get_run_node_mask;
 static tnuma_set_interleave_mask* numa_set_interleave_mask;
 static tnuma_set_localalloc* numa_set_localalloc;
 
@@ -162,6 +162,7 @@ static NUMAAPI_Result loadNumaSymbols(void) {
   NUMA_LIBRARY_FIND(numa_free_nodemask);
   NUMA_LIBRARY_FIND(numa_run_on_node_mask);
   NUMA_LIBRARY_FIND(numa_run_on_node_mask_all);
+  NUMA_LIBRARY_FIND(numa_get_run_node_mask);
   NUMA_LIBRARY_FIND(numa_set_interleave_mask);
   NUMA_LIBRARY_FIND(numa_set_localalloc);
 
@@ -204,7 +205,7 @@ int numaAPI_GetNumNodeProcessors(int node) {
   struct bitmask* cpu_mask = numa_allocate_cpumask();
   numa_node_to_cpus(node, cpu_mask);
   const unsigned int num_bytes = numa_bitmask_nbytes(cpu_mask);
-  const unsigned int num_bits = num_bytes  *8;
+  const unsigned int num_bits = num_bytes * 8;
   // TODO(sergey): There might be faster way calculating number of set bits.
   int num_processors = 0;
   for (unsigned int bit = 0; bit < num_bits; ++bit) {
@@ -224,6 +225,23 @@ int numaAPI_GetNumNodeProcessors(int node) {
   return num_processors;
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// Topology helpers.
+
+int numaAPI_GetNumCurrentNodesProcessors(void) {
+  struct bitmask* node_mask = numa_get_run_node_mask();
+  const unsigned int num_bytes = numa_bitmask_nbytes(node_mask);
+  const unsigned int num_bits = num_bytes * 8;
+  int num_processors = 0;
+  for (unsigned int bit = 0; bit < num_bits; ++bit) {
+    if (numa_bitmask_isbitset(node_mask, bit)) {
+      num_processors += numaAPI_GetNumNodeProcessors(bit);
+    }
+  }
+  numa_bitmask_free(node_mask);
+  return num_processors;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // Affinities.
 
index e054d71018cf2ace1ae1d2497860b7bfe3a4307b..6ac41136c8f668dfb665e9a123d1943b6f9285d8 100644 (file)
@@ -52,6 +52,13 @@ int numaAPI_GetNumNodeProcessors(int node) {
   return 0;
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// Topology helpers.
+
+int numaAPI_GetNumCurrentNodesProcessors(void) {
+  return 0;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // Affinities.
 
index 33cbc797bd03100733deaee2ffda8a480eb76a4d..e278ef612fdec74665cf8900162d851d51f0567d 100644 (file)
@@ -47,8 +47,6 @@
 #  include <VersionHelpers.h>
 #endif
 
-#include <stdio.h>
-
 ////////////////////////////////////////////////////////////////////////////////
 // Initialization.
 
@@ -74,9 +72,14 @@ typedef BOOL t_VirtualFree(void* address, SIZE_T size, DWORD free_type);
 typedef BOOL t_SetProcessAffinityMask(HANDLE process_handle,
                                       DWORD_PTR process_affinity_mask);
 typedef BOOL t_SetThreadGroupAffinity(HANDLE thread_handle,
-                                      const GROUP_AFFINITY* GroupAffinity,
+                                      const GROUP_AFFINITY* group_affinity,
                                       GROUP_AFFINITY* PreviousGroupAffinity);
+typedef BOOL t_GetThreadGroupAffinity(HANDLE thread_handle,
+                                      GROUP_AFFINITY* group_affinity);
 typedef DWORD t_GetCurrentProcessorNumber(void);
+typedef void t_GetCurrentProcessorNumberEx(PROCESSOR_NUMBER* proc_number);
+typedef DWORD t_GetActiveProcessorCount(WORD group_number);
+
 
 // NUMA symbols.
 static t_GetNumaHighestNodeNumber* _GetNumaHighestNodeNumber;
@@ -88,7 +91,10 @@ static t_VirtualFree* _VirtualFree;
 // Threading symbols.
 static t_SetProcessAffinityMask* _SetProcessAffinityMask;
 static t_SetThreadGroupAffinity* _SetThreadGroupAffinity;
+static t_GetThreadGroupAffinity* _GetThreadGroupAffinity;
 static t_GetCurrentProcessorNumber* _GetCurrentProcessorNumber;
+static t_GetCurrentProcessorNumberEx* _GetCurrentProcessorNumberEx;
+static t_GetActiveProcessorCount* _GetActiveProcessorCount;
 
 static void numaExit(void) {
   // TODO(sergey): Consider closing library here.
@@ -128,7 +134,10 @@ static NUMAAPI_Result loadNumaSymbols(void) {
   // Threading.
   KERNEL_LIBRARY_FIND(SetProcessAffinityMask);
   KERNEL_LIBRARY_FIND(SetThreadGroupAffinity);
+  KERNEL_LIBRARY_FIND(GetThreadGroupAffinity);
   KERNEL_LIBRARY_FIND(GetCurrentProcessorNumber);
+  KERNEL_LIBRARY_FIND(GetCurrentProcessorNumberEx);
+  KERNEL_LIBRARY_FIND(GetActiveProcessorCount);
 
 #undef KERNEL_LIBRARY_FIND
 #undef _LIBRARY_FIND
@@ -151,6 +160,19 @@ NUMAAPI_Result numaAPI_Initialize(void) {
 #endif
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// Internal helpers.
+
+static int countNumSetBits(int64_t mask) {
+  // TODO(sergey): There might be faster way calculating number of set bits.
+  int num_bits = 0;
+  while (mask != 0) {
+    num_bits += (mask & 1);
+    mask = (mask >> 1);
+  }
+  return num_bits;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // Topology query.
 
@@ -185,11 +207,26 @@ int numaAPI_GetNumNodeProcessors(int node) {
   if (!_GetNumaNodeProcessorMask(node, &processor_mask)) {
     return 0;
   }
-  // TODO(sergey): There might be faster way calculating number of set bits.
-  int num_processors = 0;
-  while (processor_mask != 0) {
-    num_processors += (processor_mask & 1);
-    processor_mask = (processor_mask >> 1);
+  return countNumSetBits(processor_mask);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Topology helpers.
+
+int numaAPI_GetNumCurrentNodesProcessors(void) {
+  HANDLE thread_handle = GetCurrentThread();
+  GROUP_AFFINITY group_affinity;
+  // TODO(sergey): Needs implementation.
+  if (!_GetThreadGroupAffinity(thread_handle, &group_affinity)) {
+    return 0;
+  }
+  // First, count number of possible bits in the affinity mask.
+  const int num_processors = countNumSetBits(group_affinity.Mask);
+  // Then check that it's not exceeding number of processors in the group.
+  const int num_group_processors =
+      _GetActiveProcessorCount(group_affinity.Group);
+  if (num_group_processors < num_processors) {
+    return num_group_processors;
   }
   return num_processors;
 }
index d31844cc0b3be397b5032cb0c5d8daea39584783..46a9160c6f67d60610fdcc1dadbe3946a7010625 160000 (submodule)
@@ -1 +1 @@
-Subproject commit d31844cc0b3be397b5032cb0c5d8daea39584783
+Subproject commit 46a9160c6f67d60610fdcc1dadbe3946a7010625