Fix #29259: cycles issues on certain processors. Now two versions of the kernel
authorBrecht Van Lommel <brechtvanlommel@pandora.be>
Tue, 15 Nov 2011 15:13:38 +0000 (15:13 +0000)
committerBrecht Van Lommel <brechtvanlommel@pandora.be>
Tue, 15 Nov 2011 15:13:38 +0000 (15:13 +0000)
are compiled, one SSE optimized and the other not, and it will choose between
them at runtime.

intern/cycles/CMakeLists.txt
intern/cycles/SConscript
intern/cycles/device/device_cpu.cpp
intern/cycles/kernel/CMakeLists.txt
intern/cycles/kernel/kernel.h
intern/cycles/kernel/kernel_optimized.cpp [new file with mode: 0644]
intern/cycles/util/util_system.cpp
intern/cycles/util/util_system.h

index d1ee5e0050dd8264b1216802de9e1f35dc714586..cfff7485e611de12762f211f91b84955e108af92 100644 (file)
@@ -9,31 +9,18 @@ include(cmake/external_libs.cmake)
 # Build Flags
 
 if(WITH_RAYOPTIMIZATION AND SUPPORT_SSE_BUILD)
-       set(GCC_OPTIM_FLAGS "-ffast-math -msse -msse2 -msse3")
-endif()
-
-if(APPLE)
-       set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GCC_OPTIM_FLAGS}")
-       set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
-endif()
+       set(WITH_CYCLES_OPTIMIZED_KERNEL ON)
 
-if(WIN32)
-       if(MSVC)
-               set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Ox /Ot /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /EHsc /fp:fast")
-               set(RTTI_DISABLE_FLAGS "/GR- -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
+       if(WIN32 AND MSVC)
+               set(CYCLES_OPTIMIZED_KERNEL_FLAGS "/Ox /Ot /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /EHsc /fp:fast")
        elseif(CMAKE_COMPILER_IS_GNUCC)
-               set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GCC_OPTIM_FLAGS}")
-               set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
+               set(CYCLES_OPTIMIZED_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -DGOGOGO")
        endif()
 endif()
 
-if(UNIX AND NOT APPLE)
-       set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GCC_OPTIM_FLAGS}")
-       set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
-endif()
-
-# not needed yet, is for open shading language
-set(RTTI_DISABLE_FLAGS "")
+# for OSL, not needed yet
+# set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
+# set(RTTI_DISABLE_FLAGS "/GR- -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
 
 # Definitions and Includes
 
@@ -42,6 +29,10 @@ add_definitions(${BOOST_DEFINITIONS} ${OPENIMAGEIO_DEFINITIONS})
 add_definitions(-DCCL_NAMESPACE_BEGIN=namespace\ ccl\ {)
 add_definitions(-DCCL_NAMESPACE_END=})
 
+if(WITH_CYCLES_OPTIMIZED_KERNEL)  # turned ON above when WITH_RAYOPTIMIZATION AND SUPPORT_SSE_BUILD
+       add_definitions(-DWITH_OPTIMIZED_KERNEL)  # C++ sources test this macro to enable the kernel_cpu_optimized_* paths
+endif()
+
 if(WITH_CYCLES_NETWORK)
   add_definitions(-DWITH_NETWORK)
 endif()
index e2c81edea376044e53f6240c9106fb2ee4f48c41..1acb7321f0981646dce8cc44c325f6094094ce6a 100644 (file)
@@ -10,11 +10,10 @@ sources = cycles.Glob('bvh/*.cpp') + cycles.Glob('device/*.cpp') + cycles.Glob('
 
 sources.remove(path.join('util', 'util_view.cpp'))
 sources.remove(path.join('render', 'film_response.cpp'))
+sources.remove(path.join('kernel', 'kernel_optimized.cpp'))
 
 incs = [] 
 defs = []
-ccflags = []
-cxxflags = []
 
 defs.append('CCL_NAMESPACE_BEGIN=namespace ccl {')
 defs.append('CCL_NAMESPACE_END=}')
@@ -23,14 +22,6 @@ defs.append('WITH_OPENCL')
 defs.append('WITH_MULTI')
 defs.append('WITH_CUDA')
 
-if env['OURPLATFORM'] in ('win32-mingw'):
-    if env['WITH_BF_RAYOPTIMIZATION']:
-        cxxflags.append('-ffast-math -msse -msse2 -msse3'.split())
-        ccflags.append('-ffast-math -msse -msse2 -msse3'.split())
-    # not needed yet, is for open shading language
-    # cxxflags.append('-fno-rtti'.split())
-    # defs.append('BOOST_NO_RTTI BOOST_NO_TYPEID'.split())
-
 incs.extend('. bvh render device kernel kernel/osl kernel/svm util subd'.split())
 incs.extend('#intern/guardedalloc #source/blender/makesrna #source/blender/makesdna'.split())
 incs.extend('#source/blender/blenloader ../../source/blender/makesrna/intern'.split())
@@ -39,5 +30,20 @@ incs.append(cycles['BF_OIIO_INC'])
 incs.append(cycles['BF_BOOST_INC'])
 incs.append(cycles['BF_PYTHON_INC'])
 
-cycles.BlenderLib('bf_intern_cycles', sources, incs, defs, libtype=['intern'], priority=[0], compileflags=[None], cc_compileflags=ccflags, cxx_compileflags=cxxflags)
+# optimized kernel
+if env['WITH_BF_RAYOPTIMIZATION']:
+    optim_cxxflags = []
+
+    if env['OURPLATFORM'] in ('win32-vc', 'win64-vc'):
+        optim_cxxflags.append('/Ox /Ot /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /EHsc /fp:fast'.split())
+    else:
+        optim_cxxflags.append('-ffast-math -msse -msse2 -msse3'.split())
+    
+    optim_defs = defs + ['WITH_OPTIMIZED_KERNEL']
+    optim_sources = [path.join('kernel', 'kernel_optimized.cpp')]
+
+    cycles_optim = cycles.Clone()
+    cycles_optim.BlenderLib('bf_intern_cycles_optimized', optim_sources, incs, optim_defs, libtype=['intern'], priority=[0], compileflags=[None], cxx_compileflags=optim_cxxflags)
+
+cycles.BlenderLib('bf_intern_cycles', sources, incs, defs, libtype=['intern'], priority=[0], compileflags=[None])
 
index d6e1c20099695d7d61a3f17028e07e8ffb2a132b..990b7cb94b0a64d2909c73d89b3c3c7bb3a40fbb 100644 (file)
@@ -48,6 +48,9 @@ public:
        {
                kg = kernel_globals_create();
 
+               /* do now to avoid thread issues */
+               system_cpu_support_optimized();
+
                if(threads_num == 0)
                        threads_num = system_cpu_thread_count();
 
@@ -155,12 +158,26 @@ public:
                        OSLShader::thread_init(kg);
 #endif
 
-               for(int y = task.y; y < task.y + task.h; y++) {
-                       for(int x = task.x; x < task.x + task.w; x++)
-                               kernel_cpu_path_trace(kg, (float4*)task.buffer, (unsigned int*)task.rng_state, task.sample, x, y);
+#ifdef WITH_OPTIMIZED_KERNEL
+               if(system_cpu_support_optimized()) {
+                       for(int y = task.y; y < task.y + task.h; y++) {
+                               for(int x = task.x; x < task.x + task.w; x++)
+                                       kernel_cpu_optimized_path_trace(kg, (float4*)task.buffer, (unsigned int*)task.rng_state, task.sample, x, y);
 
-                       if(tasks.worker_cancel())
-                               break;
+                               if(tasks.worker_cancel())
+                                       break;
+                       }
+               }
+               else
+#endif
+               {
+                       for(int y = task.y; y < task.y + task.h; y++) {
+                               for(int x = task.x; x < task.x + task.w; x++)
+                                       kernel_cpu_path_trace(kg, (float4*)task.buffer, (unsigned int*)task.rng_state, task.sample, x, y);
+
+                               if(tasks.worker_cancel())
+                                       break;
+                       }
                }
 
 #ifdef WITH_OSL
@@ -171,9 +188,18 @@ public:
 
        void thread_tonemap(DeviceTask& task)
        {
-               for(int y = task.y; y < task.y + task.h; y++) {
-                       for(int x = task.x; x < task.x + task.w; x++)
-                               kernel_cpu_tonemap(kg, (uchar4*)task.rgba, (float4*)task.buffer, task.sample, task.resolution, x, y);
+#ifdef WITH_OPTIMIZED_KERNEL /* the optimized branch exists only when the build defines this macro */
+               if(system_cpu_support_optimized()) { /* runtime CPU feature check */
+                       for(int y = task.y; y < task.y + task.h; y++)
+                               for(int x = task.x; x < task.x + task.w; x++)
+                                       kernel_cpu_optimized_tonemap(kg, (uchar4*)task.rgba, (float4*)task.buffer, task.sample, task.resolution, x, y);
+               }
+               else /* generic kernel; with the macro undefined this block is the only path */
+#endif
+               {
+                       for(int y = task.y; y < task.y + task.h; y++)
+                               for(int x = task.x; x < task.x + task.w; x++)
+                                       kernel_cpu_tonemap(kg, (uchar4*)task.rgba, (float4*)task.buffer, task.sample, task.resolution, x, y);
                }
        }
 
@@ -184,11 +210,24 @@ public:
                        OSLShader::thread_init(kg);
 #endif
 
-               for(int x = task.displace_x; x < task.displace_x + task.displace_w; x++) {
-                       kernel_cpu_displace(kg, (uint4*)task.displace_input, (float3*)task.displace_offset, x);
+#ifdef WITH_OPTIMIZED_KERNEL
+               if(system_cpu_support_optimized()) {
+                       for(int x = task.displace_x; x < task.displace_x + task.displace_w; x++) {
+                               kernel_cpu_optimized_displace(kg, (uint4*)task.displace_input, (float3*)task.displace_offset, x);
+
+                               if(tasks.worker_cancel())
+                                       break;
+                       }
+               }
+               else
+#endif
+               {
+                       for(int x = task.displace_x; x < task.displace_x + task.displace_w; x++) {
+                               kernel_cpu_displace(kg, (uint4*)task.displace_input, (float3*)task.displace_offset, x);
 
-                       if(tasks.worker_cancel())
-                               break;
+                               if(tasks.worker_cancel())
+                                       break;
+                       }
                }
 
 #ifdef WITH_OSL
index 2bfb6c5812013b02246ce57ee0b5c619ff6ba24f..73425486be1f90cc675a007bcc86a9c3641e44bc 100644 (file)
@@ -8,6 +8,7 @@ set(INC
 
 set(SRC
        kernel.cpp
+       kernel_optimized.cpp
        kernel.cl
        kernel.cu
 )
@@ -123,11 +124,15 @@ include_directories(${INC})
 
 add_library(cycles_kernel ${SRC} ${SRC_HEADERS} ${SRC_SVM_HEADERS})
 
+if(WITH_CYCLES_OPTIMIZED_KERNEL)  # per-file SSE flags; quoted so an unset or empty flag variable is still a valid property value
+       SET_SOURCE_FILES_PROPERTIES(kernel_optimized.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_OPTIMIZED_KERNEL_FLAGS}")
+endif()
+
 if(WITH_CYCLES_CUDA)
        add_dependencies(cycles_kernel cycles_kernel_cuda)
 endif()
 
-# OPENCL kernel
+# OpenCL kernel
 
 #set(KERNEL_PREPROCESSED ${CMAKE_CURRENT_BINARY_DIR}/kernel_preprocessed.cl)
 #add_custom_command(
@@ -142,3 +147,4 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel.cu" ${CYCLES_INSTALL_PATH}/k
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
+
index 7f60730e8bf97aea84432a94f7fe42f372f9ede3..700ee49c5f2f98cc33dccf3683dd00a57bf4c585 100644 (file)
@@ -38,9 +38,14 @@ void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t
 
 void kernel_cpu_path_trace(KernelGlobals *kg, float4 *buffer, unsigned int *rng_state, int sample, int x, int y);
 void kernel_cpu_tonemap(KernelGlobals *kg, uchar4 *rgba, float4 *buffer, int sample, int resolution, int x, int y);
-
 void kernel_cpu_displace(KernelGlobals *kg, uint4 *input, float3 *offset, int i);
 
+#ifdef WITH_OPTIMIZED_KERNEL /* entry points defined in kernel_optimized.cpp; same parameters as the kernel_cpu_* functions above */
+void kernel_cpu_optimized_path_trace(KernelGlobals *kg, float4 *buffer, unsigned int *rng_state, int sample, int x, int y);
+void kernel_cpu_optimized_tonemap(KernelGlobals *kg, uchar4 *rgba, float4 *buffer, int sample, int resolution, int x, int y);
+void kernel_cpu_optimized_displace(KernelGlobals *kg, uint4 *input, float3 *offset, int i);
+#endif /* WITH_OPTIMIZED_KERNEL */
+
 CCL_NAMESPACE_END
 
 #endif /* __KERNEL_H__ */
diff --git a/intern/cycles/kernel/kernel_optimized.cpp b/intern/cycles/kernel/kernel_optimized.cpp
new file mode 100644 (file)
index 0000000..85a2b79
--- /dev/null
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2011, Blender Foundation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3
+   optimization flags and nearly all functions inlined, while kernel.cpp
+   is compiled without those flags so it still runs on other CPUs. */
+
+#ifdef WITH_OPTIMIZED_KERNEL
+
+#include "kernel.h"
+#include "kernel_compat_cpu.h"
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+#include "kernel_film.h"
+#include "kernel_path.h"
+#include "kernel_displace.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Path Tracing: entry point in the SSE-flagged file; forwards its arguments unchanged to kernel_path_trace(). */
+
+void kernel_cpu_optimized_path_trace(KernelGlobals *kg, float4 *buffer, unsigned int *rng_state, int sample, int x, int y)
+{
+       kernel_path_trace(kg, buffer, rng_state, sample, x, y);
+}
+
+/* Tonemapping: entry point in the SSE-flagged file; forwards its arguments unchanged to kernel_film_tonemap(). */
+
+void kernel_cpu_optimized_tonemap(KernelGlobals *kg, uchar4 *rgba, float4 *buffer, int sample, int resolution, int x, int y)
+{
+       kernel_film_tonemap(kg, rgba, buffer, sample, resolution, x, y);
+}
+
+/* Displacement: entry point in the SSE-flagged file; forwards its arguments unchanged to kernel_displace(). */
+
+void kernel_cpu_optimized_displace(KernelGlobals *kg, uint4 *input, float3 *offset, int i)
+{
+       kernel_displace(kg, input, offset, i);
+}
+
+CCL_NAMESPACE_END
+
+#endif
+
index 8b09f227a745e7c1574ff5935daa00a4aa45aeb0..abf5e08de9700e44ba12a95db6212441a44eed74 100644 (file)
@@ -118,5 +118,78 @@ int system_cpu_bits()
        return (sizeof(void*)*8);
 }
 
+#if defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86)
+
+struct CPUCapabilities { /* CPU feature flags filled in and cached by system_cpu_support_optimized() */
+       bool x64;   /* never assigned by the active code; stays false */
+       bool mmx;   /* CPUID leaf 1, result[3] bit 23 */
+       bool sse;   /* CPUID leaf 1, result[3] bit 25 */
+       bool sse2;  /* CPUID leaf 1, result[3] bit 26 */
+       bool sse3;  /* CPUID leaf 1, result[2] bit 0 */
+       bool ssse3; /* CPUID leaf 1, result[2] bit 9 */
+       bool sse41; /* CPUID leaf 1, result[2] bit 19 */
+       bool sse42; /* CPUID leaf 1, result[2] bit 20 */
+       bool sse4a; /* never assigned by the active code; stays false */
+       bool avx;   /* CPUID leaf 1, result[2] bit 28 */
+       bool xop;   /* never assigned by the active code; stays false */
+       bool fma3;  /* CPUID leaf 1, result[2] bit 12 */
+       bool fma4;  /* never assigned by the active code; stays false */
+};
+
+bool system_cpu_support_optimized()
+{
+       static CPUCapabilities caps;
+       static bool caps_init = false;
+
+       if(!caps_init) {
+               int result[4], num, num_ex;
+
+               memset(&caps, 0, sizeof(caps));
+
+               __cpuid(result, 0);
+               num = result[0];
+
+               __cpuid(result, 0x80000000);
+               num_ex = result[0];
+
+               if(num >= 1){
+                       __cpuid(result, 0x00000001);
+                       caps.mmx = (result[3] & ((int)1 << 23)) != 0;
+                       caps.sse = (result[3] & ((int)1 << 25)) != 0;
+                       caps.sse2 = (result[3] & ((int)1 << 26)) != 0;
+                       caps.sse3 = (result[2] & ((int)1 <<  0)) != 0;
+
+                       caps.ssse3 = (result[2] & ((int)1 <<  9)) != 0;
+                       caps.sse41 = (result[2] & ((int)1 << 19)) != 0;
+                       caps.sse42 = (result[2] & ((int)1 << 20)) != 0;
+
+                       caps.avx = (result[2] & ((int)1 << 28)) != 0;
+                       caps.fma3 = (result[2] & ((int)1 << 12)) != 0;
+               }
+
+               /*if(num_ex >= 0x80000001){
+                       __cpuid(result, 0x80000001);
+                       caps.x64 = (result[3] & ((int)1 << 29)) != 0;
+                       caps.sse4a = (result[2] & ((int)1 <<  6)) != 0;
+                       caps.fma4 = (result[2] & ((int)1 << 16)) != 0;
+                       caps.xop = (result[2] & ((int)1 << 11)) != 0;
+               }*/
+
+               caps_init = true;
+       }
+
+       /* optimization flags use these */
+       return caps.sse && caps.sse2 && caps.sse3;
+}
+
+#else
+
+bool system_cpu_support_optimized()
+{
+       return false; /* not an x86 build (see the #if above): no CPUID probing, so never report SSE support */
+}
+
+#endif
+
 CCL_NAMESPACE_END
 
index 214b3a18ca3ec10747b3b724e9847a69ef76f5f8..f25e009a250fb395e2b5e0f83f0a186047984f87 100644 (file)
@@ -26,6 +26,7 @@ CCL_NAMESPACE_BEGIN
 int system_cpu_thread_count();
 string system_cpu_brand_string();
 int system_cpu_bits();
+bool system_cpu_support_optimized();
 
 CCL_NAMESPACE_END