Merge branch 'blender2.7'
authorSergey Sharybin <sergey.vfx@gmail.com>
Thu, 27 Dec 2018 18:13:27 +0000 (19:13 +0100)
committerSergey Sharybin <sergey.vfx@gmail.com>
Thu, 27 Dec 2018 18:13:27 +0000 (19:13 +0100)
intern/cycles/util/CMakeLists.txt
intern/cycles/util/util_system.cpp
intern/cycles/util/util_system.h
intern/cycles/util/util_task.cpp
intern/cycles/util/util_thread.cpp
intern/cycles/util/util_thread.h
intern/cycles/util/util_windows.cpp [deleted file]
intern/cycles/util/util_windows.h
intern/numaapi/include/numaapi.h
intern/numaapi/source/numaapi_linux.c
source/blender/blenkernel/intern/font.c

index 92dfc9f..42626d0 100644 (file)
@@ -25,13 +25,17 @@ set(SRC
        util_thread.cpp
        util_time.cpp
        util_transform.cpp
-       util_windows.cpp
 )
 
-if(WITH_CYCLES_STANDALONE AND WITH_CYCLES_STANDALONE_GUI)
-       list(APPEND SRC
-               util_view.cpp
-       )
+if(WITH_CYCLES_STANDALONE)
+       if (WITH_CYCLES_STANDALONE_GUI)
+               list(APPEND SRC
+                       util_view.cpp
+               )
+       endif()
+       list(APPEND INC_SYS ../../third_party/numaapi/include)
+else()
+       list(APPEND INC_SYS ../../numaapi/include)
 endif()
 
 set(SRC_HEADERS
index 34f428f..cc2d701 100644 (file)
@@ -20,6 +20,8 @@
 #include "util/util_types.h"
 #include "util/util_string.h"
 
+#include <numaapi.h>
+
 #ifdef _WIN32
 #  if(!defined(FREE_WINDOWS))
 #    include <intrin.h>
 
 CCL_NAMESPACE_BEGIN
 
-int system_cpu_group_count()
+bool system_cpu_ensure_initialized()
 {
-#ifdef _WIN32
-       util_windows_init_numa_groups();
-       return GetActiveProcessorGroupCount();
-#else
-       /* TODO(sergey): Need to adopt for other platforms. */
-       return 1;
-#endif
+       static bool is_initialized = false;
+       static bool result = false;
+       if (is_initialized) {
+               return result;
+       }
+       is_initialized = true;
+       const NUMAAPI_Result numa_result = numaAPI_Initialize();
+       result = (numa_result == NUMAAPI_SUCCESS);
+       return result;
 }
 
-int system_cpu_group_thread_count(int group)
+/* Fallback solution, which doesn't use NUMA/CPU groups. */
+static int system_cpu_thread_count_fallback()
 {
-       /* TODO(sergey): Need make other platforms aware of groups. */
 #ifdef _WIN32
-       util_windows_init_numa_groups();
-       return GetActiveProcessorCount(group);
+       SYSTEM_INFO info;
+       GetSystemInfo(&info);
+       return info.dwNumberOfProcessors;
 #elif defined(__APPLE__)
-       (void) group;
        int count;
        size_t len = sizeof(count);
        int mib[2] = { CTL_HW, HW_NCPU };
        sysctl(mib, 2, &count, &len, NULL, 0);
        return count;
 #else
-       (void) group;
        return sysconf(_SC_NPROCESSORS_ONLN);
 #endif
 }
 
 int system_cpu_thread_count()
 {
-       static uint count = 0;
-
-       if(count > 0) {
-               return count;
+       const int num_nodes = system_cpu_num_numa_nodes();
+       int num_threads = 0;
+       for (int node = 0; node < num_nodes; ++node) {
+               if (!system_cpu_is_numa_node_available(node)) {
+                       continue;
+               }
+               num_threads += system_cpu_num_numa_node_processors(node);
        }
+       return num_threads;
+}
 
-       int max_group = system_cpu_group_count();
-       VLOG(1) << "Detected " << max_group << " CPU groups.";
-       for(int group = 0; group < max_group; ++group) {
-               int num_threads = system_cpu_group_thread_count(group);
-               VLOG(1) << "Group " << group
-                       << " has " << num_threads << " threads.";
-               count += num_threads;
+int system_cpu_num_numa_nodes()
+{
+       if (!system_cpu_ensure_initialized()) {
+               /* Fallback to a single node with all the threads. */
+               return 1;
        }
+       return numaAPI_GetNumNodes();
+}
 
-       if(count < 1) {
-               count = 1;
+bool system_cpu_is_numa_node_available(int node)
+{
+       if (!system_cpu_ensure_initialized()) {
+               return true;
        }
+       return numaAPI_IsNodeAvailable(node);
+}
 
-       return count;
+int system_cpu_num_numa_node_processors(int node)
+{
+       if (!system_cpu_ensure_initialized()) {
+               return system_cpu_thread_count_fallback();
+       }
+       return numaAPI_GetNumNodeProcessors(node);
 }
 
-unsigned short system_cpu_process_groups(unsigned short max_groups,
-                                         unsigned short *groups)
+bool system_cpu_run_thread_on_node(int node)
 {
-#ifdef _WIN32
-       unsigned short group_count = max_groups;
-       if(!GetProcessGroupAffinity(GetCurrentProcess(), &group_count, groups)) {
-               return 0;
+       if (!system_cpu_ensure_initialized()) {
+               return true;
        }
-       return group_count;
-#else
-       (void) max_groups;
-       (void) groups;
-       return 0;
-#endif
+       return numaAPI_RunThreadOnNode(node);
 }
 
 #if !defined(_WIN32) || defined(FREE_WINDOWS)
index 241ac89..15f69bc 100644 (file)
 
 CCL_NAMESPACE_BEGIN
 
-/* Get number of available CPU groups. */
-int system_cpu_group_count();
+/* Make sure CPU groups / NUMA API is initialized. */
+bool system_cpu_ensure_initialized();
 
-/* Get number of threads/processors in the specified group. */
-int system_cpu_group_thread_count(int group);
-
-/* Get total number of threads in all groups. */
+/* Get total number of threads in all NUMA nodes / CPU groups. */
 int system_cpu_thread_count();
 
-/* Get current process groups. */
-unsigned short system_cpu_process_groups(unsigned short max_groups,
-                                         unsigned short *grpups);
+/* Get number of available nodes.
+ *
+ * This is in fact an index of last node plus one and it's not guaranteed
+ * that all nodes up to this one are available. */
+int system_cpu_num_numa_nodes();
+
+/* Returns truth if the given node is available for compute. */
+bool system_cpu_is_numa_node_available(int node);
+
+/* Get number of available processors on a given node. */
+int system_cpu_num_numa_node_processors(int node);
+
+/* Runs the current thread and its children on a specific node.
+ *
+ * Returns truth if affinity has successfully changed. */
+bool system_cpu_run_thread_on_node(int node);
 
 string system_cpu_brand_string();
 int system_cpu_bits();
index 2d21d6b..50a2bb1 100644 (file)
@@ -204,50 +204,26 @@ void TaskScheduler::init(int num_threads)
                /* launch threads that will be waiting for work */
                threads.resize(num_threads);
 
-               const int num_groups = system_cpu_group_count();
-               unsigned short num_process_groups = 0;
-               vector<unsigned short> process_groups;
-               int current_group_threads = 0;
-               if(num_groups > 1) {
-                       process_groups.resize(num_groups);
-                       num_process_groups = system_cpu_process_groups(num_groups,
-                                                                      &process_groups[0]);
-                       if(num_process_groups == 1) {
-                               current_group_threads = system_cpu_group_thread_count(process_groups[0]);
-                       }
-               }
+               const int num_nodes = system_cpu_num_numa_nodes();
                int thread_index = 0;
-               for(int group = 0; group < num_groups; ++group) {
-                       /* NOTE: That's not really efficient from threading point of view,
-                        * but it is simple to read and it doesn't make sense to use more
-                        * user-specified threads than logical threads anyway.
-                        */
-                       int num_group_threads = (group == num_groups - 1)
-                               ? (threads.size() - thread_index)
-                               : system_cpu_group_thread_count(group);
-                       for(int group_thread = 0;
-                               group_thread < num_group_threads && thread_index < threads.size();
-                               ++group_thread, ++thread_index)
+               for (int node = 0;
+                    node < num_nodes && thread_index < threads.size();
+                    ++node)
+               {
+                       if (!system_cpu_is_numa_node_available(node)) {
+                               continue;
+                       }
+                       const int num_node_processors =
+                               system_cpu_num_numa_node_processors(node);
+                       for (int i = 0;
+                            i < num_node_processors && thread_index < threads.size();
+                            ++i)
                        {
-                               /* NOTE: Thread group of -1 means we would not force thread affinity. */
-                               int thread_group;
-                               if(num_groups == 1) {
-                                       /* Use default affinity if there's only one CPU group in the system. */
-                                       thread_group = -1;
-                               }
-                               else if(use_auto_threads &&
-                                       num_process_groups == 1 &&
-                                               num_threads <= current_group_threads)
-                               {
-                                       /* If we fit into curent CPU group we also don't force any affinity. */
-                                       thread_group = -1;
-                               }
-                               else {
-                                       thread_group = group;
-                               }
-                               threads[thread_index] = new thread(function_bind(&TaskScheduler::thread_run,
-                                                                                thread_index + 1),
-                                                                  thread_group);
+                               threads[thread_index] = new thread(
+                                       function_bind(&TaskScheduler::thread_run,
+                                                     thread_index + 1),
+                                       node);
+                               thread_index++;
                        }
                }
        }
index 37d8bdb..4d30e3f 100644 (file)
 
 CCL_NAMESPACE_BEGIN
 
-thread::thread(function<void()> run_cb, int group)
+thread::thread(function<void()> run_cb, int node)
   : run_cb_(run_cb),
     joined_(false),
-       group_(group)
+       node_(node)
 {
        thread_ = std::thread(&thread::run, this);
 }
@@ -39,19 +39,8 @@ thread::~thread()
 void *thread::run(void *arg)
 {
        thread *self = (thread*)(arg);
-       if(self->group_ != -1) {
-#ifdef _WIN32
-               HANDLE thread_handle = GetCurrentThread();
-               GROUP_AFFINITY group_affinity = { 0 };
-               int num_threads = system_cpu_group_thread_count(self->group_);
-               group_affinity.Group = self->group_;
-               group_affinity.Mask = (num_threads == 64)
-                                             ? -1
-                                             :  (1ull << num_threads) - 1;
-               if(SetThreadGroupAffinity(thread_handle, &group_affinity, NULL) == 0) {
-                       fprintf(stderr, "Error setting thread affinity.\n");
-               }
-#endif
+       if (self->node_ != -1) {
+               system_cpu_run_thread_on_node(self->node_);
        }
        self->run_cb_();
        return NULL;
index 6250bb9..d54199a 100644 (file)
@@ -46,7 +46,7 @@ typedef std::condition_variable thread_condition_variable;
 
 class thread {
 public:
-       thread(function<void()> run_cb, int group = -1);
+       thread(function<void()> run_cb, int node = -1);
        ~thread();
 
        static void *run(void *arg);
@@ -56,7 +56,7 @@ protected:
        function<void()> run_cb_;
        std::thread thread_;
        bool joined_;
-       int group_;
+       int node_;
 };
 
 /* Own wrapper around pthread's spin lock to make it's use easier. */
diff --git a/intern/cycles/util/util_windows.cpp b/intern/cycles/util/util_windows.cpp
deleted file mode 100644 (file)
index 073db2a..0000000
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "util/util_windows.h"
-
-#ifdef _WIN32
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef _M_X64
-#  include <VersionHelpers.h>
-#endif
-
-#if _WIN32_WINNT < 0x0601
-tGetActiveProcessorGroupCount *GetActiveProcessorGroupCount;
-tGetActiveProcessorCount *GetActiveProcessorCount;
-tSetThreadGroupAffinity *SetThreadGroupAffinity;
-tGetProcessGroupAffinity *GetProcessGroupAffinity;
-#endif
-
-static WORD GetActiveProcessorGroupCount_stub()
-{
-       return 1;
-}
-
-static DWORD GetActiveProcessorCount_stub(WORD /*GroupNumber*/)
-{
-       SYSTEM_INFO info;
-       GetSystemInfo(&info);
-       return info.dwNumberOfProcessors;
-}
-
-static BOOL SetThreadGroupAffinity_stub(
-        HANDLE /*hThread*/,
-        const GROUP_AFFINITY  * /*GroupAffinity*/,
-        PGROUP_AFFINITY /*PreviousGroupAffinity*/)
-{
-       return TRUE;
-}
-
-static BOOL GetProcessGroupAffinity_stub(HANDLE hProcess,
-                                         PUSHORT GroupCount,
-                                         PUSHORT GroupArray)
-{
-       if(*GroupCount < 1) {
-               return FALSE;
-       }
-       *GroupCount = 1;
-       GroupArray[0] = 0;
-       return TRUE;
-}
-
-static bool supports_numa()
-{
-#ifndef _M_X64
-       return false;
-#else
-       return IsWindows7OrGreater();
-#endif
-}
-
-void util_windows_init_numa_groups()
-{
-       static bool initialized = false;
-       if(initialized) {
-               return;
-       }
-       initialized = true;
-#if _WIN32_WINNT < 0x0601
-       if(!supports_numa()) {
-               /* Use stubs on platforms which doesn't have rean NUMA/Groups. */
-               GetActiveProcessorGroupCount = GetActiveProcessorGroupCount_stub;
-               GetActiveProcessorCount = GetActiveProcessorCount_stub;
-               SetThreadGroupAffinity = SetThreadGroupAffinity_stub;
-               GetProcessGroupAffinity = GetProcessGroupAffinity_stub;
-               return;
-       }
-       HMODULE kernel = GetModuleHandleA("kernel32.dll");
-#  define READ_SYMBOL(sym) sym = (t##sym*)GetProcAddress(kernel, #sym)
-       READ_SYMBOL(GetActiveProcessorGroupCount);
-       READ_SYMBOL(GetActiveProcessorCount);
-       READ_SYMBOL(SetThreadGroupAffinity);
-       READ_SYMBOL(GetProcessGroupAffinity);
-#  undef READ_SUMBOL
-#endif
-}
-
-CCL_NAMESPACE_END
-
-#endif  /* _WIN32 */
index 9b9268f..bd1bc85 100644 (file)
 
 #include <windows.h>
 
-CCL_NAMESPACE_BEGIN
-
-#if _WIN32_WINNT < 0x0601
-typedef WORD tGetActiveProcessorGroupCount();
-typedef DWORD tGetActiveProcessorCount(WORD GroupNumber);
-typedef BOOL tSetThreadGroupAffinity(HANDLE hThread,
-                                     const GROUP_AFFINITY  *GroupAffinity,
-                                     PGROUP_AFFINITY PreviousGroupAffinity);
-typedef BOOL tGetProcessGroupAffinity(HANDLE  hProcess,
-                                     PUSHORT GroupCount,
-                                     PUSHORT GroupArray);
-
-extern tGetActiveProcessorGroupCount *GetActiveProcessorGroupCount;
-extern tGetActiveProcessorCount *GetActiveProcessorCount;
-extern tSetThreadGroupAffinity *SetThreadGroupAffinity;
-extern tGetProcessGroupAffinity *GetProcessGroupAffinity;
-#endif
-
-/* Make sure NUMA and processor groups API is initialized. */
-void util_windows_init_numa_groups();
-
-CCL_NAMESPACE_END
-
-#endif  /* WIN32 */
+#endif  /* _WIN32 */
 
 #endif  /* __UTIL_WINDOWS_H__ */
index a4f32d8..7b5b50f 100644 (file)
@@ -67,7 +67,7 @@ int numaAPI_GetNumNodes(void);
 // Returns truth if the given node is available for compute.
 bool numaAPI_IsNodeAvailable(int node);
 
-// Getnumber of available processors on a given node.
+// Get number of available processors on a given node.
 int numaAPI_GetNumNodeProcessors(int node);
 
 ////////////////////////////////////////////////////////////////////////////////
index 559e97b..62e9dcd 100644 (file)
@@ -34,6 +34,8 @@
 #  include <dlfcn.h>
 #endif
 
+#include <stdio.h>
+
 #ifdef WITH_DYNLOAD
 
 // Descriptor numa library.
@@ -61,6 +63,7 @@ typedef struct bitmask* tnuma_allocate_nodemask(void);
 typedef void tnuma_free_cpumask(struct bitmask* bitmask);
 typedef void tnuma_free_nodemask(struct bitmask* bitmask);
 typedef int tnuma_run_on_node_mask(struct bitmask *nodemask);
+typedef int tnuma_run_on_node_mask_all(struct bitmask *nodemask);
 typedef void tnuma_set_interleave_mask(struct bitmask *nodemask);
 typedef void tnuma_set_localalloc(void);
 
@@ -83,6 +86,7 @@ static tnuma_allocate_nodemask* numa_allocate_nodemask;
 static tnuma_free_nodemask* numa_free_nodemask;
 static tnuma_free_cpumask* numa_free_cpumask;
 static tnuma_run_on_node_mask* numa_run_on_node_mask;
+static tnuma_run_on_node_mask_all* numa_run_on_node_mask_all;
 static tnuma_set_interleave_mask* numa_set_interleave_mask;
 static tnuma_set_localalloc* numa_set_localalloc;
 
@@ -157,6 +161,7 @@ static NUMAAPI_Result loadNumaSymbols(void) {
   NUMA_LIBRARY_FIND(numa_free_cpumask);
   NUMA_LIBRARY_FIND(numa_free_nodemask);
   NUMA_LIBRARY_FIND(numa_run_on_node_mask);
+  NUMA_LIBRARY_FIND(numa_run_on_node_mask_all);
   NUMA_LIBRARY_FIND(numa_set_interleave_mask);
   NUMA_LIBRARY_FIND(numa_set_localalloc);
 
@@ -192,10 +197,7 @@ int numaAPI_GetNumNodes(void) {
 }
 
 bool numaAPI_IsNodeAvailable(int node) {
-  if (numa_node_size(node, NULL) > 0) {
-    return true;
-  }
-  return false;
+  return numaAPI_GetNumNodeProcessors(node) > 0;
 }
 
 int numaAPI_GetNumNodeProcessors(int node) {
@@ -235,13 +237,15 @@ bool numaAPI_RunThreadOnNode(int node) {
   struct bitmask* node_mask = numa_allocate_nodemask();
   numa_bitmask_clearall(node_mask);
   numa_bitmask_setbit(node_mask, node);
-  numa_run_on_node_mask(node_mask);
+  numa_run_on_node_mask_all(node_mask);
   // TODO(sergey): The following commands are based on x265 code, we might want
   // to make those optional, or require to call those explicitly.
   //
   // Current assumption is that this is similar to SetThreadGroupAffinity().
-  numa_set_interleave_mask(node_mask);
-  numa_set_localalloc();
+  if (numa_node_size(node, NULL) > 0) {
+    numa_set_interleave_mask(node_mask);
+    numa_set_localalloc();
+  }
 #ifdef WITH_DYNLOAD
   if (numa_free_nodemask != NULL) {
     numa_free_nodemask(node_mask);
index 1b6061d..d796110 100644 (file)
@@ -244,7 +244,6 @@ VFont *BKE_vfont_load(Main *bmain, const char *filepath)
        char filename[FILE_MAXFILE];
        VFont *vfont = NULL;
        PackedFile *pf;
-       PackedFile *temp_pf = NULL;
        bool is_builtin;
 
        if (STREQ(filepath, FO_BUILTIN_NAME)) {
@@ -256,7 +255,6 @@ VFont *BKE_vfont_load(Main *bmain, const char *filepath)
        else {
                BLI_split_file_part(filepath, filename, sizeof(filename));
                pf = newPackedFile(NULL, filepath, BKE_main_blendfile_path(bmain));
-               temp_pf = newPackedFile(NULL, filepath, BKE_main_blendfile_path(bmain));
 
                is_builtin = false;
        }
@@ -282,7 +280,7 @@ VFont *BKE_vfont_load(Main *bmain, const char *filepath)
 
                        /* Do not add FO_BUILTIN_NAME to temporary listbase */
                        if (!STREQ(filename, FO_BUILTIN_NAME)) {
-                               vfont->temp_pf = temp_pf;
+                               vfont->temp_pf = newPackedFile(NULL, filepath, BKE_main_blendfile_path(bmain));
                        }
                }