Fix Cycles intersection issue with overlapping faces on Windows 32-bit and CPUs
author: Brecht Van Lommel <brechtvanlommel@pandora.be>
Mon, 4 Feb 2013 16:12:37 +0000 (16:12 +0000)
committer: Brecht Van Lommel <brechtvanlommel@pandora.be>
Mon, 4 Feb 2013 16:12:37 +0000 (16:12 +0000)
without SSE3 support, due to an 80-bit extended-precision float register being
used for one bounding box but not for the one next to it.

intern/cycles/CMakeLists.txt
intern/cycles/SConscript
intern/cycles/device/device_cpu.cpp
intern/cycles/kernel/CMakeLists.txt
intern/cycles/kernel/kernel.h
intern/cycles/kernel/kernel_bvh.h
intern/cycles/kernel/kernel_sse2.cpp [moved from intern/cycles/kernel/kernel_optimized.cpp with 79% similarity]
intern/cycles/kernel/kernel_sse3.cpp [new file with mode: 0644]
intern/cycles/util/util_system.cpp
intern/cycles/util/util_system.h

index 535239a92059c8b27ad8c865c500f5011a5f0e30..226218ae512f4bcdd5aaa9f8c8c3e82731a55f88 100644 (file)
@@ -13,10 +13,12 @@ if(WITH_RAYOPTIMIZATION AND SUPPORT_SSE_BUILD)
 endif()
 
 if(WIN32 AND MSVC)
-       set(CYCLES_OPTIMIZED_KERNEL_FLAGS "/arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc")
+       set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc")
+       set(CYCLES_SSE3_KERNEL_FLAGS "/arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc")
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /EHsc")
 elseif(CMAKE_COMPILER_IS_GNUCC)
-       set(CYCLES_OPTIMIZED_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mfpmath=sse")
+       set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2 -mfpmath=sse")
+       set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mfpmath=sse")
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
 endif()
 
index 19af7dede9faafbf2ffa4b004967bfe0adcef4b5..8a8ef9cce396689ebeb923b70c7ec396e720dd8c 100644 (file)
@@ -36,7 +36,8 @@ sources = cycles.Glob('bvh/*.cpp') + cycles.Glob('device/*.cpp') + cycles.Glob('
 
 sources.remove(path.join('util', 'util_view.cpp'))
 sources.remove(path.join('render', 'film_response.cpp'))
-sources.remove(path.join('kernel', 'kernel_optimized.cpp'))
+sources.remove(path.join('kernel', 'kernel_sse2.cpp'))
+sources.remove(path.join('kernel', 'kernel_sse3.cpp'))
 
 incs = [] 
 defs = []
@@ -73,21 +74,29 @@ if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc', '
 
 # optimized kernel
 if env['WITH_BF_RAYOPTIMIZATION']:
-    optim_cxxflags = Split(env['CXXFLAGS'])
+    sse2_cxxflags = Split(env['CXXFLAGS'])
+    sse3_cxxflags = Split(env['CXXFLAGS'])
 
     if env['OURPLATFORM'] == 'win32-vc':
-        optim_cxxflags.append('/arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc'.split())
+        sse2_cxxflags.append('/arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc'.split())
+        sse3_cxxflags.append('/arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc'.split())
     elif env['OURPLATFORM'] == 'win64-vc':
-        optim_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc'.split())
+        sse2_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc'.split())
+        sse3_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc'.split())
     else:
-        optim_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mfpmath=sse'.split())
+        sse2_cxxflags.append('-ffast-math -msse -msse2 -mfpmath=sse'.split())
+        sse3_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mfpmath=sse'.split())
     
     defs.append('WITH_OPTIMIZED_KERNEL')
     optim_defs = defs[:]
-    optim_sources = [path.join('kernel', 'kernel_optimized.cpp')]
 
-    cycles_optim = cycles.Clone()
-    cycles_optim.BlenderLib('bf_intern_cycles_optimized', optim_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=optim_cxxflags)
+    cycles_sse3 = cycles.Clone()
+    sse3_sources = [path.join('kernel', 'kernel_sse3.cpp')]
+    cycles_sse3.BlenderLib('bf_intern_cycles_sse3', sse3_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse3_cxxflags)
+
+    cycles_sse2 = cycles.Clone()
+    sse2_sources = [path.join('kernel', 'kernel_sse2.cpp')]
+    cycles_sse2.BlenderLib('bf_intern_cycles_sse2', sse2_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse2_cxxflags)
 
 cycles.BlenderLib('bf_intern_cycles', sources, incs, defs, libtype=['intern'], priority=[0], cxx_compileflags=cxxflags)
 
index a1d7706a34e41b2e2b95f1800914d4c3064c5387..1915245bb55db92b99a9120e265791e58786decb 100644 (file)
@@ -58,7 +58,8 @@ public:
 #endif
 
                /* do now to avoid thread issues */
-               system_cpu_support_optimized();
+               system_cpu_support_sse2();
+               system_cpu_support_sse3();
        }
 
        ~CPUDevice()
@@ -170,7 +171,7 @@ public:
                        int end_sample = tile.start_sample + tile.num_samples;
 
 #ifdef WITH_OPTIMIZED_KERNEL
-                       if(system_cpu_support_optimized()) {
+                       if(system_cpu_support_sse2()) {
                                for(int sample = start_sample; sample < end_sample; sample++) {
                                        if (task.get_cancel() || task_pool.cancelled()) {
                                                if(task.need_finish_queue == false)
@@ -179,7 +180,26 @@ public:
 
                                        for(int y = tile.y; y < tile.y + tile.h; y++) {
                                                for(int x = tile.x; x < tile.x + tile.w; x++) {
-                                                       kernel_cpu_optimized_path_trace(&kg, render_buffer, rng_state,
+                                                       kernel_cpu_sse2_path_trace(&kg, render_buffer, rng_state,
+                                                               sample, x, y, tile.offset, tile.stride);
+                                               }
+                                       }
+
+                                       tile.sample = sample + 1;
+
+                                       task.update_progress(tile);
+                               }
+                       }
+                       else if(system_cpu_support_sse3()) {
+                               for(int sample = start_sample; sample < end_sample; sample++) {
+                                       if (task.get_cancel() || task_pool.cancelled()) {
+                                               if(task.need_finish_queue == false)
+                                                       break;
+                                       }
+
+                                       for(int y = tile.y; y < tile.y + tile.h; y++) {
+                                               for(int x = tile.x; x < tile.x + tile.w; x++) {
+                                                       kernel_cpu_sse3_path_trace(&kg, render_buffer, rng_state,
                                                                sample, x, y, tile.offset, tile.stride);
                                                }
                                        }
@@ -227,10 +247,16 @@ public:
        void thread_tonemap(DeviceTask& task)
        {
 #ifdef WITH_OPTIMIZED_KERNEL
-               if(system_cpu_support_optimized()) {
+               if(system_cpu_support_sse2()) {
+                       for(int y = task.y; y < task.y + task.h; y++)
+                               for(int x = task.x; x < task.x + task.w; x++)
+                                       kernel_cpu_sse2_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer,
+                                               task.sample, task.resolution, x, y, task.offset, task.stride);
+               }
+               else if(system_cpu_support_sse3()) {
                        for(int y = task.y; y < task.y + task.h; y++)
                                for(int x = task.x; x < task.x + task.w; x++)
-                                       kernel_cpu_optimized_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer,
+                                       kernel_cpu_sse3_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer,
                                                task.sample, task.resolution, x, y, task.offset, task.stride);
                }
                else
@@ -252,9 +278,17 @@ public:
 #endif
 
 #ifdef WITH_OPTIMIZED_KERNEL
-               if(system_cpu_support_optimized()) {
+               if(system_cpu_support_sse2()) {
+                       for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
+                               kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+
+                               if(task_pool.cancelled())
+                                       break;
+                       }
+               }
+               else if(system_cpu_support_sse3()) {
                        for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
-                               kernel_cpu_optimized_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+                               kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
 
                                if(task_pool.cancelled())
                                        break;
index 6d5b9a063a089ee44d59e19fa4b40a132e071de9..e83756b7c8a648f80e3d0ebaae49f0b496f767c5 100644 (file)
@@ -12,7 +12,8 @@ set(INC_SYS
 
 set(SRC
        kernel.cpp
-       kernel_optimized.cpp
+       kernel_sse2.cpp
+       kernel_sse3.cpp
        kernel.cl
        kernel.cu
 )
@@ -149,7 +150,8 @@ include_directories(SYSTEM ${INC_SYS})
 add_library(cycles_kernel ${SRC} ${SRC_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_SVM_HEADERS})
 
 if(WITH_CYCLES_OPTIMIZED_KERNEL)
-       set_source_files_properties(kernel_optimized.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_OPTIMIZED_KERNEL_FLAGS}")
+       set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+       set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
 endif()
 
 if(WITH_CYCLES_CUDA)
index 26c0bcd6d1a531d8b32af904d581db1e4284fe55..20ea5a61906113297e7e29917b4d1b394b236a58 100644 (file)
@@ -44,11 +44,18 @@ void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output,
        int type, int i);
 
 #ifdef WITH_OPTIMIZED_KERNEL
-void kernel_cpu_optimized_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
+void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
        int sample, int x, int y, int offset, int stride);
-void kernel_cpu_optimized_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer,
+void kernel_cpu_sse2_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer,
        int sample, int resolution, int x, int y, int offset, int stride);
-void kernel_cpu_optimized_shader(KernelGlobals *kg, uint4 *input, float4 *output,
+void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output,
+       int type, int i);
+
+void kernel_cpu_sse3_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
+       int sample, int x, int y, int offset, int stride);
+void kernel_cpu_sse3_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer,
+       int sample, int resolution, int x, int y, int offset, int stride);
+void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output,
        int type, int i);
 #endif
 
index 1a85b5bbefd105c66aa5157bae6fdd558346061a..2b9ebf35d0cfc935d5aa6ad83742ce894832dea2 100644 (file)
@@ -126,21 +126,21 @@ __device_inline void bvh_node_intersect(KernelGlobals *kg,
 
        /* intersect ray against child nodes */
        float3 ood = P * idir;
-       float c0lox = n0xy.x * idir.x - ood.x;
-       float c0hix = n0xy.y * idir.x - ood.x;
-       float c0loy = n0xy.z * idir.y - ood.y;
-       float c0hiy = n0xy.w * idir.y - ood.y;
-       float c0loz = nz.x * idir.z - ood.z;
-       float c0hiz = nz.y * idir.z - ood.z;
+       NO_EXTENDED_PRECISION float c0lox = n0xy.x * idir.x - ood.x;
+       NO_EXTENDED_PRECISION float c0hix = n0xy.y * idir.x - ood.x;
+       NO_EXTENDED_PRECISION float c0loy = n0xy.z * idir.y - ood.y;
+       NO_EXTENDED_PRECISION float c0hiy = n0xy.w * idir.y - ood.y;
+       NO_EXTENDED_PRECISION float c0loz = nz.x * idir.z - ood.z;
+       NO_EXTENDED_PRECISION float c0hiz = nz.y * idir.z - ood.z;
        NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
        NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
 
-       float c1loz = nz.z * idir.z - ood.z;
-       float c1hiz = nz.w * idir.z - ood.z;
-       float c1lox = n1xy.x * idir.x - ood.x;
-       float c1hix = n1xy.y * idir.x - ood.x;
-       float c1loy = n1xy.z * idir.y - ood.y;
-       float c1hiy = n1xy.w * idir.y - ood.y;
+       NO_EXTENDED_PRECISION float c1loz = nz.z * idir.z - ood.z;
+       NO_EXTENDED_PRECISION float c1hiz = nz.w * idir.z - ood.z;
+       NO_EXTENDED_PRECISION float c1lox = n1xy.x * idir.x - ood.x;
+       NO_EXTENDED_PRECISION float c1hix = n1xy.y * idir.x - ood.x;
+       NO_EXTENDED_PRECISION float c1loy = n1xy.z * idir.y - ood.y;
+       NO_EXTENDED_PRECISION float c1hiy = n1xy.w * idir.y - ood.y;
        NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
        NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
 
similarity index 79%
rename from intern/cycles/kernel/kernel_optimized.cpp
rename to intern/cycles/kernel/kernel_sse2.cpp
index 0b662095133c1e1fcd847e13b17d61bfa280aa86..7947107a43c26a86c5f7219e811bf0299fa7a54b 100644 (file)
@@ -35,21 +35,21 @@ CCL_NAMESPACE_BEGIN
 
 /* Path Tracing */
 
-void kernel_cpu_optimized_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
+void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
 {
        kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
 }
 
 /* Tonemapping */
 
-void kernel_cpu_optimized_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer, int sample, int resolution, int x, int y, int offset, int stride)
+void kernel_cpu_sse2_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer, int sample, int resolution, int x, int y, int offset, int stride)
 {
        kernel_film_tonemap(kg, rgba, buffer, sample, resolution, x, y, offset, stride);
 }
 
 /* Shader Evaluate */
 
-void kernel_cpu_optimized_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
+void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
 {
        kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
 }
diff --git a/intern/cycles/kernel/kernel_sse3.cpp b/intern/cycles/kernel/kernel_sse3.cpp
new file mode 100644 (file)
index 0000000..9a8b389
--- /dev/null
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2011, Blender Foundation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+
+#ifdef WITH_OPTIMIZED_KERNEL
+
+#include "kernel.h"
+#include "kernel_compat_cpu.h"
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+#include "kernel_film.h"
+#include "kernel_path.h"
+#include "kernel_displace.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Path Tracing */
+
+void kernel_cpu_sse3_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
+{
+       kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+}
+
+/* Tonemapping */
+
+void kernel_cpu_sse3_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer, int sample, int resolution, int x, int y, int offset, int stride)
+{
+       kernel_film_tonemap(kg, rgba, buffer, sample, resolution, x, y, offset, stride);
+}
+
+/* Shader Evaluate */
+
+void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
+{
+       kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
+}
+
+CCL_NAMESPACE_END
+
+#endif
+
index 2d9f0fffae66a853e3bcbd059dd9673df8573741..4fda090e09ebdb3e955121e5ec7f8aed489225d5 100644 (file)
@@ -136,7 +136,7 @@ struct CPUCapabilities {
        bool fma4;
 };
 
-bool system_cpu_support_optimized()
+static CPUCapabilities& system_cpu_capabilities()
 {
        static CPUCapabilities caps;
        static bool caps_init = false;
@@ -182,7 +182,18 @@ bool system_cpu_support_optimized()
                caps_init = true;
        }
 
-       /* optimization flags use these */
+       return caps;
+}
+
+bool system_cpu_support_sse2()
+{
+       CPUCapabilities& caps = system_cpu_capabilities();
+       return caps.sse && caps.sse2;
+}
+
+bool system_cpu_support_sse3()
+{
+       CPUCapabilities& caps = system_cpu_capabilities();
        return caps.sse && caps.sse2 && caps.sse3;
 }
 
index f25e009a250fb395e2b5e0f83f0a186047984f87..257112883d1aa0da5a88cf4ead7a041b21897fd4 100644 (file)
@@ -26,7 +26,8 @@ CCL_NAMESPACE_BEGIN
 int system_cpu_thread_count();
 string system_cpu_brand_string();
 int system_cpu_bits();
-bool system_cpu_support_optimized();
+bool system_cpu_support_sse2();
+bool system_cpu_support_sse3();
 
 CCL_NAMESPACE_END