Cycles: Add an AVX2 CPU kernel.
author Thomas Dinges <blender@dingto.org>
Fri, 13 Jun 2014 20:23:58 +0000 (22:23 +0200)
committer Thomas Dinges <blender@dingto.org>
Fri, 13 Jun 2014 20:26:20 +0000 (22:26 +0200)
This kernel is compiled with AVX2, FMA3, and BMI compiler flags. At the moment only Intel Haswell benefits from this, but future AMD CPUs will have these instructions as well.

Makes rendering on Haswell CPUs a few percent faster; so far this has only been benchmarked with clang on OS X.

Part of my GSoC 2014.

intern/cycles/CMakeLists.txt
intern/cycles/SConscript
intern/cycles/device/device_cpu.cpp
intern/cycles/kernel/CMakeLists.txt
intern/cycles/kernel/kernel.h
intern/cycles/kernel/kernel_avx2.cpp [new file with mode: 0644]
intern/cycles/util/util_optimization.h
intern/cycles/util/util_system.cpp
intern/cycles/util/util_system.h

index a1b0030491e3540a275bb6ffbe6fcdab0e8ab6f1..5a6dc36b213660a0c896545e83ddbf4398dd2be6 100644 (file)
@@ -20,8 +20,10 @@ if(WIN32 AND MSVC)
        # /arch:AVX for VC2012 and above
        if(NOT MSVC_VERSION LESS 1700)
                set(CYCLES_AVX_ARCH_FLAGS "/arch:AVX")
+               set(CYCLES_AVX2_ARCH_FLAGS "/arch:AVX /arch:AVX2")
        elseif(NOT CMAKE_CL_64)
                set(CYCLES_AVX_ARCH_FLAGS "/arch:SSE2")
+               set(CYCLES_AVX2_ARCH_FLAGS "/arch:SSE2")
        endif()
 
        # there is no /arch:SSE3, but intrinsics are available anyway
@@ -30,11 +32,13 @@ if(WIN32 AND MSVC)
                set(CYCLES_SSE3_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
                set(CYCLES_SSE41_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
                set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
+               set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
        else()
                set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
                set(CYCLES_SSE3_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
                set(CYCLES_SSE41_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
                set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
+               set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
        endif()
 
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
@@ -48,6 +52,7 @@ elseif(CMAKE_COMPILER_IS_GNUCC)
                set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse")
                set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse")
                set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mfpmath=sse")
+               set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mbmi -mbmi2 -mfpmath=sse")
        endif()
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
 elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
@@ -57,6 +62,7 @@ elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
                set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3")
                set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1")
                set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx")
+               set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mbmi -mbmi2")
        endif()
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
 endif()
@@ -67,6 +73,7 @@ if(CXX_HAS_SSE)
                -DWITH_KERNEL_SSE3
                -DWITH_KERNEL_SSE41
                -DWITH_KERNEL_AVX
+               -DWITH_KERNEL_AVX2
        )
 endif()
 
index 542bb82cf2aa376fc576269bac5c4f16e0a71420..dab8f25de4a36a154b49a1e61987919ec9416448 100644 (file)
@@ -39,6 +39,7 @@ sources.remove(path.join('kernel', 'kernel_sse2.cpp'))
 sources.remove(path.join('kernel', 'kernel_sse3.cpp'))
 sources.remove(path.join('kernel', 'kernel_sse41.cpp'))
 sources.remove(path.join('kernel', 'kernel_avx.cpp'))
+sources.remove(path.join('kernel', 'kernel_avx2.cpp'))
 
 incs = [] 
 defs = []
@@ -98,6 +99,7 @@ elif env['OURPLATFORM'] == 'win64-vc':
     if env['MSVC_VERSION'] >= '12.0':
         kernel_flags['sse41'] = kernel_flags['sse3']
         kernel_flags['avx'] = kernel_flags['sse41'] + ' /arch:AVX'
+        kernel_flags['avx2'] = kernel_flags['sse41'] + ' /arch:AVX /arch:AVX2'
 else:
     # -mavx only available with relatively new gcc/clang
     kernel_flags['sse2'] = '-ffast-math -msse -msse2 -mfpmath=sse'
@@ -106,6 +108,7 @@ else:
 
     if (env['C_COMPILER_ID'] == 'gcc' and env['CCVERSION'] >= '4.6') or (env['C_COMPILER_ID'] == 'clang' and env['CCVERSION'] >= '3.1'):
         kernel_flags['avx'] = kernel_flags['sse41'] + ' -mavx'
+        kernel_flags['avx2'] = kernel_flags['avx'] + ' -mavx2 -mfma -mbmi -mbmi2'
 
 for kernel_type in kernel_flags.keys():
     defs.append('WITH_KERNEL_' + kernel_type.upper())
index 71bf2d23d6ebc8dfc27deb971792c0b9d3357ae3..7308d036fe3ae669da3f050d3e093557182613b5 100644 (file)
@@ -62,6 +62,7 @@ public:
                system_cpu_support_sse3();
                system_cpu_support_sse41();
                system_cpu_support_avx();
+               system_cpu_support_avx2();
        }
 
        ~CPUDevice()
@@ -167,6 +168,28 @@ public:
                        int start_sample = tile.start_sample;
                        int end_sample = tile.start_sample + tile.num_samples;
 
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+                       if(system_cpu_support_avx2()) {
+                               for(int sample = start_sample; sample < end_sample; sample++) {
+                                       if (task.get_cancel() || task_pool.canceled()) {
+                                               if(task.need_finish_queue == false)
+                                                       break;
+                                       }
+
+                                       for(int y = tile.y; y < tile.y + tile.h; y++) {
+                                               for(int x = tile.x; x < tile.x + tile.w; x++) {
+                                                       kernel_cpu_avx2_path_trace(&kg, render_buffer, rng_state,
+                                                                                                         sample, x, y, tile.offset, tile.stride);
+                                               }
+                                       }
+
+                                       tile.sample = sample + 1;
+
+                                       task.update_progress(tile);
+                               }
+                       }
+                       else
+#endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
                        if(system_cpu_support_avx()) {
                                for(int sample = start_sample; sample < end_sample; sample++) {
@@ -293,6 +316,15 @@ public:
                float sample_scale = 1.0f/(task.sample + 1);
 
                if(task.rgba_half) {
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+                       if(system_cpu_support_avx2()) {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_avx2_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+                                                                                                                        sample_scale, x, y, task.offset, task.stride);
+                       }
+                       else
+#endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
                        if(system_cpu_support_avx()) {
                                for(int y = task.y; y < task.y + task.h; y++)
@@ -337,6 +369,15 @@ public:
                        }
                }
                else {
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+                       if(system_cpu_support_avx2()) {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_avx2_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+                                                                                                          sample_scale, x, y, task.offset, task.stride);
+                       }
+                       else
+#endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
                        if(system_cpu_support_avx()) {
                                for(int y = task.y; y < task.y + task.h; y++)
@@ -390,6 +431,18 @@ public:
                OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
 #endif
 
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+               if(system_cpu_support_avx2()) {
+                       for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
+                               for(int sample = 0; sample < task.num_samples; sample++)
+                                       kernel_cpu_avx2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
+
+                               if(task.get_cancel() || task_pool.canceled())
+                                       break;
+                       }
+               }
+               else
+#endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
                if(system_cpu_support_avx()) {
                        for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
index 62d56b465091f723530f56aba92e6e6cbc142b47..9896a55cf025e157e34d582ff64185e3109d1ca7 100644 (file)
@@ -214,12 +214,14 @@ if(CXX_HAS_SSE)
                kernel_sse3.cpp
                kernel_sse41.cpp
                kernel_avx.cpp
+               kernel_avx2.cpp
        )
 
        set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
        set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
        set_source_files_properties(kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
        set_source_files_properties(kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+       set_source_files_properties(kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
 endif()
 
 
index b169b15b9b5ce760ab5be19d57df2c0309c4f805..264e5e3e4d053ba633b28d6af66b2cdba8f4512d 100644 (file)
@@ -87,6 +87,17 @@ void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output,
        int type, int i, int sample);
 #endif
 
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+void kernel_cpu_avx2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
+       int sample, int x, int y, int offset, int stride);
+void kernel_cpu_avx2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer,
+       float sample_scale, int x, int y, int offset, int stride);
+void kernel_cpu_avx2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
+       float sample_scale, int x, int y, int offset, int stride);
+void kernel_cpu_avx2_shader(KernelGlobals *kg, uint4 *input, float4 *output,
+       int type, int i, int sample);
+#endif
+
 CCL_NAMESPACE_END
 
 #endif /* __KERNEL_H__ */
diff --git a/intern/cycles/kernel/kernel_avx2.cpp b/intern/cycles/kernel/kernel_avx2.cpp
new file mode 100644 (file)
index 0000000..339421a
--- /dev/null
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them, for other CPUs. */
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#define __KERNEL_SSE2__
+#define __KERNEL_SSE3__
+#define __KERNEL_SSSE3__
+#define __KERNEL_SSE41__
+#define __KERNEL_AVX__
+#define __KERNEL_AVX2__
+#endif
+#include "util_optimization.h"
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+
+#include "kernel.h"
+#include "kernel_compat_cpu.h"
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+#include "kernel_film.h"
+#include "kernel_path.h"
+#include "kernel_bake.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Path Tracing */
+
+void kernel_cpu_avx2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
+{
+#ifdef __BRANCHED_PATH__
+       if(kernel_data.integrator.branched)
+               kernel_branched_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+       else
+#endif
+               kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+}
+
+/* Film */
+
+void kernel_cpu_avx2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+       kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+void kernel_cpu_avx2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+       kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+/* Shader Evaluate */
+
+void kernel_cpu_avx2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int sample)
+{
+       if(type >= SHADER_EVAL_BAKE)
+               kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
+       else
+               kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
+}
+
+CCL_NAMESPACE_END
+#else
+
+/* Needed for some linkers in combination with SCons, which would otherwise produce an empty compilation unit in a library. */
+void __dummy_function_cycles_avx2(void);
+void __dummy_function_cycles_avx2(void) {}
+
+#endif
index 0a6013cddd44fa6b2a273dc8d5e6e89ec1c7e458..5d0fea3476196373172f9724d712780251ddfb37 100644 (file)
 #define WITH_CYCLES_OPTIMIZED_KERNEL_AVX
 #endif
 
+#ifdef WITH_KERNEL_AVX2
+#define WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+#endif
+
 /* MSVC 2008, no SSE41 (broken blendv intrinsic) and no AVX support */
 #if defined(_MSC_VER) && (_MSC_VER < 1700)
 #undef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
 #undef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+#undef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
 #endif
 
 #endif
index 0764f7d93454195cacb3e2252cabc697d317704b..7c0445577e216c780dd5250ca5c7bb323cc512ba 100644 (file)
@@ -127,9 +127,12 @@ struct CPUCapabilities {
        bool sse42;
        bool sse4a;
        bool avx;
+       bool avx2;
        bool xop;
        bool fma3;
        bool fma4;
+       bool bmi1;
+       bool bmi2;
 };
 
 static CPUCapabilities& system_cpu_capabilities()
@@ -180,6 +183,11 @@ static CPUCapabilities& system_cpu_capabilities()
 #endif
                                caps.avx = (xcr_feature_mask & 0x6) == 0x6;
                        }
+
+                       __cpuid(result, 0x00000007);
+                       caps.bmi1 = (result[1] & ((int)1 << 3)) != 0;
+                       caps.bmi2 = (result[1] & ((int)1 << 8)) != 0;
+                       caps.avx2 = (result[1] & ((int)1 << 5)) != 0;
                }
 
 #if 0
@@ -221,6 +229,11 @@ bool system_cpu_support_avx()
        CPUCapabilities& caps = system_cpu_capabilities();
        return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx;
 }
+bool system_cpu_support_avx2()
+{
+       CPUCapabilities& caps = system_cpu_capabilities();
+       return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx && caps.avx2 && caps.fma3 && caps.bmi1 && caps.bmi2;
+}
 #else
 
 bool system_cpu_support_sse2()
@@ -242,6 +255,10 @@ bool system_cpu_support_avx()
 {
        return false;
 }
+bool system_cpu_support_avx2()
+{
+       return false;
+}
 
 #endif
 
index 4409ea752cd3784f7c19aa6eae279d7e6af42ddb..0e8868c7dfc7f2c4729a74cdf9eddc5953322cf6 100644 (file)
@@ -28,6 +28,7 @@ bool system_cpu_support_sse2();
 bool system_cpu_support_sse3();
 bool system_cpu_support_sse41();
 bool system_cpu_support_avx();
+bool system_cpu_support_avx2();
 
 CCL_NAMESPACE_END