Merge branch 'master' into blender2.8
authorBastien Montagne <montagne29@wanadoo.fr>
Mon, 7 Aug 2017 14:16:43 +0000 (16:16 +0200)
committerBastien Montagne <montagne29@wanadoo.fr>
Mon, 7 Aug 2017 14:16:43 +0000 (16:16 +0200)
44 files changed:
intern/cycles/blender/addon/presets.py
intern/cycles/blender/addon/properties.py
intern/cycles/blender/addon/ui.py
intern/cycles/blender/blender_sync.cpp
intern/cycles/device/device_cpu.cpp
intern/cycles/device/device_cuda.cpp
intern/cycles/device/opencl/opencl_util.cpp
intern/cycles/kernel/CMakeLists.txt
intern/cycles/kernel/filter/filter_features_sse.h
intern/cycles/kernel/filter/filter_nlm_cpu.h
intern/cycles/kernel/filter/filter_prefilter.h
intern/cycles/kernel/filter/filter_transform_sse.h
intern/cycles/kernel/geom/geom_object.h
intern/cycles/kernel/kernel_path.h
intern/cycles/kernel/kernel_path_state.h
intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
intern/cycles/kernel/kernels/cuda/kernel_config.h
intern/cycles/kernel/kernels/cuda/kernel_split.cu
intern/cycles/kernel/split/kernel_shader_sort.h
intern/cycles/render/image.cpp
intern/cycles/render/integrator.cpp
intern/cycles/render/integrator.h
intern/cycles/render/light.cpp
intern/cycles/render/session.cpp
intern/cycles/render/shader.cpp
intern/cycles/util/CMakeLists.txt
intern/cycles/util/util_defines.h [new file with mode: 0644]
intern/cycles/util/util_math.h
intern/cycles/util/util_math_float3.h
intern/cycles/util/util_math_float4.h
intern/cycles/util/util_math_matrix.h
intern/cycles/util/util_optimization.h
intern/cycles/util/util_simd.h
intern/cycles/util/util_sseb.h
intern/cycles/util/util_ssef.h
intern/cycles/util/util_ssei.h
intern/cycles/util/util_types.h
source/blender/editors/space_clip/space_clip.c
source/blender/editors/space_sequencer/space_sequencer.c
source/blender/windowmanager/intern/wm_files.c
tests/python/CMakeLists.txt
tests/python/cycles_render_tests.py

index 440221b847098d0886b26a88aa3d323d91a84716..17efb00abdb85873fb6387e7d5cf6d7ccf9ececf 100644 (file)
@@ -37,7 +37,6 @@ class AddPresetIntegrator(AddPresetBase, Operator):
         "cycles.transmission_bounces",
         "cycles.volume_bounces",
         "cycles.transparent_max_bounces",
-        "cycles.use_transparent_shadows",
         "cycles.caustics_reflective",
         "cycles.caustics_refractive",
         "cycles.blur_glossy"
index cfffe5362ca5f7409fdc2c5479220b293d325a3f..93b90ec650b6b77a279649d33746ba26f39b6322 100644 (file)
@@ -343,11 +343,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                 min=0, max=1024,
                 default=8,
                 )
-        cls.use_transparent_shadows = BoolProperty(
-                name="Transparent Shadows",
-                description="Use transparency of surfaces for rendering shadows",
-                default=True,
-                )
 
         cls.volume_step_size = FloatProperty(
                 name="Step Size",
index 59ee053efb99755262f7b3ef3e94bf94696bf7f4..983f817539b467fdc87d790f2adfcf7d8a8c8f05 100644 (file)
@@ -293,7 +293,6 @@ class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel):
         sub = col.column(align=True)
         sub.label("Transparency:")
         sub.prop(cscene, "transparent_max_bounces", text="Max")
-        sub.prop(cscene, "use_transparent_shadows", text="Shadows")
 
         col.separator()
 
index 15ad4ff301c175b4694a77d2554894826b2c7b9b..adbabaccdc1c3246726fbdcd4475fc76eec4aaed 100644 (file)
@@ -245,7 +245,6 @@ void BlenderSync::sync_integrator()
        integrator->max_volume_bounce = get_int(cscene, "volume_bounces");
 
        integrator->transparent_max_bounce = get_int(cscene, "transparent_max_bounces");
-       integrator->transparent_shadows = get_boolean(cscene, "use_transparent_shadows");
 
        integrator->volume_max_steps = get_int(cscene, "volume_max_steps");
        integrator->volume_step_size = get_float(cscene, "volume_step_size");
index 18112437b458f1deb6b5d742684531e47cb65f96..a00be3eeaab773b7d19e8f1eaf35745f88d4a583 100644 (file)
@@ -48,6 +48,7 @@
 #include "util/util_logging.h"
 #include "util/util_map.h"
 #include "util/util_opengl.h"
+#include "util/util_optimization.h"
 #include "util/util_progress.h"
 #include "util/util_system.h"
 #include "util/util_thread.h"
index e53aec0fbb97a61c1fd04a45bc0e1bce665b1906..f13506c89603dc8ab30459a3180328a7641bfab8 100644 (file)
@@ -1919,17 +1919,13 @@ public:
                int threads_per_block;
                cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
 
-               int xthreads = (int)sqrt(threads_per_block);
-               int ythreads = (int)sqrt(threads_per_block);
-
-               int xblocks = (dim.global_size[0] + xthreads - 1)/xthreads;
-               int yblocks = (dim.global_size[1] + ythreads - 1)/ythreads;
+               int xblocks = (dim.global_size[0]*dim.global_size[1] + threads_per_block - 1)/threads_per_block;
 
                cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
 
                cuda_assert(cuLaunchKernel(func,
-                                          xblocks , yblocks, 1, /* blocks */
-                                          xthreads, ythreads, 1, /* threads */
+                                          xblocks, 1, 1, /* blocks */
+                                          threads_per_block, 1, 1, /* threads */
                                           0, 0, args, 0));
 
                device->cuda_pop_context();
index 0d34af3e04002f115414f2dc39bd239a0b77e123..7d5173a5f1ddbec78254063b79bcca78cb5e3a26 100644 (file)
@@ -635,7 +635,7 @@ bool OpenCLInfo::device_supported(const string& platform_name,
                        "Tahiti", "Pitcairn", "Capeverde", "Oland",
                        NULL
                };
-               for (int i = 0; blacklist[i] != NULL; i++) {
+               for(int i = 0; blacklist[i] != NULL; i++) {
                        if(device_name == blacklist[i]) {
                                VLOG(1) << "AMD device " << device_name << " not supported";
                                return false;
index 23e9bd311c45eb3704dc00ff937defc833c85052..88c4c4e3282bb8ddc64221de5fec2b4aa6609511 100644 (file)
@@ -233,6 +233,7 @@ set(SRC_FILTER_HEADERS
 set(SRC_UTIL_HEADERS
        ../util/util_atomic.h
        ../util/util_color.h
+       ../util/util_defines.h
        ../util/util_half.h
        ../util/util_hash.h
        ../util/util_math.h
index 3185330994c69557718219ed1646c77775a78150..3ddd871226647e7b01cfbd2eb81ebf4e60453f8f 100644 (file)
@@ -16,7 +16,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-#define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride)
+#define ccl_get_feature_sse(pass) load_float4(buffer + (pass)*pass_stride)
 
 /* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time.
  * pixel_buffer always points to the first of the 4 current pixel in the first pass.
@@ -24,25 +24,25 @@ CCL_NAMESPACE_BEGIN
 
 #define FOR_PIXEL_WINDOW_SSE     pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
                                  for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
-                                     __m128 y4 = _mm_set1_ps(pixel.y); \
+                                     float4 y4 = make_float4(pixel.y); \
                                      for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
-                                         __m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \
-                                         __m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x));
+                                         float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \
+                                         int4 active_pixels = x4 < make_float4(high.x);
 
 #define END_FOR_PIXEL_WINDOW_SSE     } \
                                      pixel_buffer += buffer_w - (pixel.x - low.x); \
                                  }
 
-ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y,
-                                               __m128 active_pixels,
+ccl_device_inline void filter_get_features_sse(float4 x, float4 y,
+                                               int4 active_pixels,
                                                const float *ccl_restrict buffer,
-                                               __m128 *features,
-                                               const __m128 *ccl_restrict mean,
+                                               float4 *features,
+                                               const float4 *ccl_restrict mean,
                                                int pass_stride)
 {
        features[0] = x;
        features[1] = y;
-       features[2] = _mm_fabs_ps(ccl_get_feature_sse(0));
+       features[2] = fabs(ccl_get_feature_sse(0));
        features[3] = ccl_get_feature_sse(1);
        features[4] = ccl_get_feature_sse(2);
        features[5] = ccl_get_feature_sse(3);
@@ -52,53 +52,41 @@ ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y,
        features[9] = ccl_get_feature_sse(7);
        if(mean) {
                for(int i = 0; i < DENOISE_FEATURES; i++)
-                       features[i] = _mm_sub_ps(features[i], mean[i]);
+                       features[i] = features[i] - mean[i];
        }
        for(int i = 0; i < DENOISE_FEATURES; i++)
-               features[i] = _mm_mask_ps(features[i], active_pixels);
+               features[i] = mask(active_pixels, features[i]);
 }
 
-ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y,
-                                                     __m128 active_pixels,
+ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y,
+                                                     int4 active_pixels,
                                                      const float *ccl_restrict buffer,
-                                                     __m128 *scales,
-                                                     const __m128 *ccl_restrict mean,
+                                                     float4 *scales,
+                                                     const float4 *ccl_restrict mean,
                                                      int pass_stride)
 {
-       scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels);
-       scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels);
-
-       scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(_mm_fabs_ps(ccl_get_feature_sse(0)), mean[2])), active_pixels);
-
-       __m128 diff, scale;
-       diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]);
-       scale = _mm_mul_ps(diff, diff);
-       diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]);
-       scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
-       diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]);
-       scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
-       scales[3] = _mm_mask_ps(scale, active_pixels);
-
-       scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels);
-
-       diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]);
-       scale = _mm_mul_ps(diff, diff);
-       diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]);
-       scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
-       diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]);
-       scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
-       scales[5] = _mm_mask_ps(scale, active_pixels);
+       scales[0] = fabs(x - mean[0]);
+       scales[1] = fabs(y - mean[1]);
+       scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]);
+       scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) +
+                   sqr(ccl_get_feature_sse(2) - mean[4]) +
+                   sqr(ccl_get_feature_sse(3) - mean[5]);
+       scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]);
+       scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) +
+                   sqr(ccl_get_feature_sse(6) - mean[8]) +
+                   sqr(ccl_get_feature_sse(7) - mean[9]);
+       for(int i = 0; i < 6; i++)
+               scales[i] = mask(active_pixels, scales[i]);
 }
 
-ccl_device_inline void filter_calculate_scale_sse(__m128 *scale)
+ccl_device_inline void filter_calculate_scale_sse(float4 *scale)
 {
-       scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f)));
-       scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f)));
-       scale[2] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f)));
-       scale[6] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f)));
-
-       scale[7] = scale[8] = scale[9] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f)));
-       scale[3] = scale[4] = scale[5] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f)));
+       scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f)));
+       scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f)));
+       scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f)));
+       scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f)));
+       scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f)));
+       scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f)));
 }
 
 
index 3e752bce68f01afd5bae510ad0a0b707aaee517c..5e989331bc2cacdca36f6f3db50a3f1b005ea897 100644 (file)
@@ -50,10 +50,8 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen
                                               int w,
                                               int f)
 {
-#ifdef __KERNEL_SSE3__
-       int aligned_lowx = (rect.x & ~(3));
-       int aligned_highx = ((rect.z + 3) & ~(3));
-#endif
+       int aligned_lowx = rect.x / 4;
+       int aligned_highx = (rect.z + 3) / 4;
        for(int y = rect.y; y < rect.w; y++) {
                const int low = max(rect.y, y-f);
                const int high = min(rect.w, y+f+1);
@@ -61,15 +59,11 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen
                        out_image[y*w+x] = 0.0f;
                }
                for(int y1 = low; y1 < high; y1++) {
-#ifdef __KERNEL_SSE3__
-                       for(int x = aligned_lowx; x < aligned_highx; x+=4) {
-                               _mm_store_ps(out_image + y*w+x, _mm_add_ps(_mm_load_ps(out_image + y*w+x), _mm_load_ps(difference_image + y1*w+x)));
+                       float4* out_image4 = (float4*)(out_image + y*w);
+                       float4* difference_image4 = (float4*)(difference_image + y1*w);
+                       for(int x = aligned_lowx; x < aligned_highx; x++) {
+                               out_image4[x] += difference_image4[x];
                        }
-#else
-                       for(int x = rect.x; x < rect.z; x++) {
-                               out_image[y*w+x] += difference_image[y1*w+x];
-                       }
-#endif
                }
                for(int x = rect.x; x < rect.z; x++) {
                        out_image[y*w+x] *= 1.0f/(high - low);
index a0b89c1111fcb3b473db80ae67b8f5060bae12f7..c6a70cbeab5ed27799109754931b8ce1b3a371ca 100644 (file)
@@ -96,7 +96,7 @@ ccl_device void kernel_filter_get_feature(int sample,
        int idx = (y-rect.y)*buffer_w + (x - rect.x);
 
        mean[idx] = center_buffer[m_offset] / sample;
-       if (sample > 1) {
+       if(sample > 1) {
                if(use_split_variance) {
                        variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1)));
                }
index 30dc2969b114f40fd3c41a360e9a5ad0ab4efaa9..9e65f61664b6544de50abec160084e6fa049f020 100644 (file)
@@ -24,7 +24,7 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
 {
        int buffer_w = align_up(rect.z - rect.x, 4);
 
-       __m128 features[DENOISE_FEATURES];
+       float4 features[DENOISE_FEATURES];
        const float *ccl_restrict pixel_buffer;
        int2 pixel;
 
@@ -34,19 +34,19 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
                              min(rect.w, y + radius + 1));
        int num_pixels = (high.y - low.y) * (high.x - low.x);
 
-       __m128 feature_means[DENOISE_FEATURES];
+       float4 feature_means[DENOISE_FEATURES];
        math_vector_zero_sse(feature_means, DENOISE_FEATURES);
        FOR_PIXEL_WINDOW_SSE {
                filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride);
                math_vector_add_sse(feature_means, DENOISE_FEATURES, features);
        } END_FOR_PIXEL_WINDOW_SSE
 
-       __m128 pixel_scale = _mm_set1_ps(1.0f / num_pixels);
+       float4 pixel_scale = make_float4(1.0f / num_pixels);
        for(int i = 0; i < DENOISE_FEATURES; i++) {
-               feature_means[i] = _mm_mul_ps(_mm_hsum_ps(feature_means[i]), pixel_scale);
+               feature_means[i] = reduce_add(feature_means[i]) * pixel_scale;
        }
 
-       __m128 feature_scale[DENOISE_FEATURES];
+       float4 feature_scale[DENOISE_FEATURES];
        math_vector_zero_sse(feature_scale, DENOISE_FEATURES);
        FOR_PIXEL_WINDOW_SSE {
                filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
@@ -55,12 +55,12 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
 
        filter_calculate_scale_sse(feature_scale);
 
-       __m128 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
+       float4 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
        math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES);
        FOR_PIXEL_WINDOW_SSE {
                filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
                math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale);
-               math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f));
+               math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, make_float4(1.0f));
        } END_FOR_PIXEL_WINDOW_SSE
 
        float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
@@ -98,7 +98,7 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
 
        /* Bake the feature scaling into the transformation matrix. */
        for(int i = 0; i < DENOISE_FEATURES; i++) {
-               math_vector_scale(transform + i*DENOISE_FEATURES, _mm_cvtss_f32(feature_scale[i]), *rank);
+               math_vector_scale(transform + i*DENOISE_FEATURES, feature_scale[i][0], *rank);
        }
 }
 
index 6ecdfe0173a0806cceec881fd1bffd97ac6db186..1ffc143be34dfba186c4b1283c3d5d63535fd0ca 100644 (file)
@@ -415,12 +415,7 @@ ccl_device_inline float3 bvh_clamp_direction(float3 dir)
 
 ccl_device_inline float3 bvh_inverse_direction(float3 dir)
 {
-       /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
-#if defined(__KERNEL_SSE__) && 0
        return rcp(dir);
-#else
-       return 1.0f / dir;
-#endif
 }
 
 /* Transform ray into object space to enter static object in BVH */
index c340b3bc96859096b6cc1844436c8884185418bb..8f6c2b07381ba5762b835d07c10d5cb10338c5c1 100644 (file)
@@ -100,6 +100,8 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
 
 #ifndef __SPLIT_KERNEL__
 
+#if defined(__BRANCHED_PATH__) || defined(__BAKING__)
+
 ccl_device void kernel_path_indirect(KernelGlobals *kg,
                                      ShaderData *sd,
                                      ShaderData *emission_sd,
@@ -428,6 +430,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
        }
 }
 
+#endif /* defined(__BRANCHED_PATH__) || defined(__BAKING__) */
 
 ccl_device_inline float kernel_path_integrate(KernelGlobals *kg,
                                               RNG *rng,
index a96ffe0771885b13112dcdc2fe3db9f67562f20b..3ce183bf67a34be4fd0609c8316712be28ca66fa 100644 (file)
@@ -173,7 +173,7 @@ ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, ccl_
                }
 #ifdef __SHADOW_TRICKS__
                /* Exception for shadow catcher not working correctly with RR. */
-               else if ((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->transparent_bounce <= 8)) {
+               else if((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->transparent_bounce <= 8)) {
                        return 1.0f;
                }
 #endif
@@ -196,7 +196,7 @@ ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, ccl_
                }
 #ifdef __SHADOW_TRICKS__
                /* Exception for shadow catcher not working correctly with RR. */
-               else if ((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->bounce <= 3)) {
+               else if((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->bounce <= 3)) {
                        return 1.0f;
                }
 #endif
index f7c9935f1d078c518ee7d0c92c452509b7f2bd6f..a13fb5cd4fb18d6d889fb953859b72817df6e8d5 100644 (file)
@@ -25,6 +25,7 @@
 #else
 /* SSE optimization disabled for now on 32 bit, see bug #36316 */
 #  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE__
 #    define __KERNEL_SSE2__
 #  endif
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
index 070b95a35053942cffc04a2ca4b3e77386ac549b..6b690adf0f50a6c2f2f40cba81cfb9d6fc012687 100644 (file)
@@ -25,6 +25,7 @@
 #else
 /* SSE optimization disabled for now on 32 bit, see bug #36316 */
 #  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE__
 #    define __KERNEL_SSE2__
 #    define __KERNEL_SSE3__
 #    define __KERNEL_SSSE3__
index 1a7b2040da1f8ccf67180d49c9b1bc830f268234..254025be4e26fff4adc9fb9bf140106963383b3a 100644 (file)
@@ -25,6 +25,7 @@
 #else
 /* SSE optimization disabled for now on 32 bit, see bug #36316 */
 #  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE__
 #    define __KERNEL_SSE2__
 #    define __KERNEL_SSE3__
 #    define __KERNEL_SSSE3__
index 9fa39dc9ebbd753b88b88509d2cf4a2e285e9259..7ae205b7e14469f94cc11ae842bce15241ffe8b6 100644 (file)
 #  error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
 #endif
 
-/* compute number of threads per block and minimum blocks per multiprocessor
- * given the maximum number of registers per thread */
+/* For split kernel using all registers seems fastest for now, but this
+ * is unlikely to be optimal once we resolve other bottlenecks. */
+
+#define CUDA_KERNEL_SPLIT_MAX_REGISTERS CUDA_THREAD_MAX_REGISTERS
+
+/* Compute number of threads per block and minimum blocks per multiprocessor
+ * given the maximum number of registers per thread. */
 
 #define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
        __launch_bounds__( \
index 628891b1458b602f39987ea5e44191bfd1e20570..e97e87285a5179c4ee27b7ebba1daeec7262dadb 100644 (file)
@@ -90,7 +90,7 @@ kernel_cuda_path_trace_data_init(
 
 #define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
        extern "C" __global__ void \
-       CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \
+       CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \
        kernel_cuda_##name() \
        { \
                kernel_##name(NULL); \
@@ -98,7 +98,7 @@ kernel_cuda_path_trace_data_init(
 
 #define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
        extern "C" __global__ void \
-       CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \
+       CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \
        kernel_cuda_##name() \
        { \
                ccl_local type locals; \
index 297decb0bc2eed866debbb55bdebf193e6b3be19..5a55b680695463b52c5ddac59fe57d559e1cb8e7 100644 (file)
@@ -39,7 +39,7 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg,
        ccl_local ushort *local_index = &locals->local_index[0];
 
        /* copy to local memory */
-       for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
+       for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
                uint idx = offset + i + lid;
                uint add = input + idx;
                uint value = (~0);
@@ -59,9 +59,9 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg,
 #  ifdef __KERNEL_OPENCL__
 
        /* bitonic sort */
-       for (uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) {
-               for (uint inc = length; inc > 0; inc >>= 1) {
-                       for (uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) {
+       for(uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) {
+               for(uint inc = length; inc > 0; inc >>= 1) {
+                       for(uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) {
                                uint i = lid + ii;
                                bool direction = ((i & (length << 1)) != 0);
                                uint j = i ^ inc;
@@ -81,7 +81,7 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg,
 #  endif /* __KERNEL_OPENCL__ */
 
        /* copy to destination */
-       for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
+       for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
                uint idx = offset + i + lid;
                uint lidx = local_index[i + lid];
                uint outi = output + idx;
index 02b65440154424fbe3b31ed0b0f5d5cf251b1272..a490f10aee463732e82d4adb5556091ac6bb9a90 100644 (file)
@@ -344,7 +344,7 @@ int ImageManager::add_image(const string& filename,
        else {
                /* Very unlikely, since max_num_images is insanely big. But better safe than sorry. */
                int tex_count = 0;
-               for (int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
+               for(int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
                        tex_count += tex_num_images[type];
                }
                if(tex_count > max_num_images) {
index b9b8c681a262daaf2e0b2eaaf0b20ea4d9b5768a..15b728d6e027c5366c8466f16d91757006d03178 100644 (file)
@@ -39,7 +39,6 @@ NODE_DEFINE(Integrator)
        SOCKET_INT(max_volume_bounce, "Max Volume Bounce", 7);
 
        SOCKET_INT(transparent_max_bounce, "Transparent Max Bounce", 7);
-       SOCKET_BOOLEAN(transparent_shadows, "Transparent Shadows", false);
 
        SOCKET_INT(ao_bounces, "AO Bounces", 0);
 
@@ -121,19 +120,14 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
         * We only need to enable transparent shadows, if we actually have 
         * transparent shaders in the scene. Otherwise we can disable it
         * to improve performance a bit. */
-       if(transparent_shadows) {
-               kintegrator->transparent_shadows = false;
-               foreach(Shader *shader, scene->shaders) {
-                       /* keep this in sync with SD_HAS_TRANSPARENT_SHADOW in shader.cpp */
-                       if((shader->has_surface_transparent && shader->use_transparent_shadow) || shader->has_volume) {
-                               kintegrator->transparent_shadows = true;
-                               break;
-                       }
+       kintegrator->transparent_shadows = false;
+       foreach(Shader *shader, scene->shaders) {
+               /* keep this in sync with SD_HAS_TRANSPARENT_SHADOW in shader.cpp */
+               if((shader->has_surface_transparent && shader->use_transparent_shadow) || shader->has_volume) {
+                       kintegrator->transparent_shadows = true;
+                       break;
                }
        }
-       else {
-               kintegrator->transparent_shadows = false;
-       }
 
        kintegrator->volume_max_steps = volume_max_steps;
        kintegrator->volume_step_size = volume_step_size;
index ce5651ec823b6ee149cdfb4b57fcbc78fabb6352..3cb430d72b40bb03aa6d18dd6a553bd075076e9d 100644 (file)
@@ -39,7 +39,6 @@ public:
        int max_volume_bounce;
 
        int transparent_max_bounce;
-       bool transparent_shadows;
 
        int ao_bounces;
 
index 93d88c5642c74a325766cb40cd114f153b9ce245..371ea54ef11c77f672b3dd23d51d31acfc6cf8b1 100644 (file)
@@ -225,7 +225,7 @@ void LightManager::disable_ineffective_light(Device *device, Scene *scene)
 bool LightManager::object_usable_as_light(Object *object) {
        Mesh *mesh = object->mesh;
        /* Skip objects with NaNs */
-       if (!object->bounds.valid()) {
+       if(!object->bounds.valid()) {
                return false;
        }
        /* Skip if we are not visible for BSDFs. */
index 8622318858e61538fa6373fca4ccc56f019408d8..ca3aefcb5e60d700879ec73ff07507d7be0e57bd 100644 (file)
@@ -721,7 +721,6 @@ DeviceRequestedFeatures Session::get_requested_device_features()
        BakeManager *bake_manager = scene->bake_manager;
        requested_features.use_baking = bake_manager->get_baking();
        requested_features.use_integrator_branched = (scene->integrator->method == Integrator::BRANCHED_PATH);
-       requested_features.use_transparent &= scene->integrator->transparent_shadows;
        requested_features.use_denoising = params.use_denoising;
 
        return requested_features;
index 44a266dfe18b97d7e93cb830007f81b67c8e8b5b..493e01de36344cc4a7897dd2a73c932e59119b1a 100644 (file)
@@ -503,9 +503,7 @@ void ShaderManager::device_update_common(Device *device,
        KernelIntegrator *kintegrator = &dscene->data.integrator;
        kintegrator->use_volumes = has_volumes;
        /* TODO(sergey): De-duplicate with flags set in integrator.cpp. */
-       if(scene->integrator->transparent_shadows) {
-               kintegrator->transparent_shadows = has_transparent_shadow;
-       }
+       kintegrator->transparent_shadows = has_transparent_shadow;
 }
 
 void ShaderManager::device_free_common(Device *device, DeviceScene *dscene, Scene *scene)
index 43f9a57d09928446083545702610f51b842f1bc3..7f3747a0f5818aa2e901cf8c207042792b1786ae 100644 (file)
@@ -38,6 +38,7 @@ set(SRC_HEADERS
        util_atomic.h
        util_boundbox.h
        util_debug.h
+       util_defines.h
        util_guarded_allocator.cpp
        util_foreach.h
        util_function.h
diff --git a/intern/cycles/util/util_defines.h b/intern/cycles/util/util_defines.h
new file mode 100644 (file)
index 0000000..d0d87e7
--- /dev/null
@@ -0,0 +1,134 @@
+
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_DEFINES_H__
+#define __UTIL_DEFINES_H__
+
+/* Bitness */
+
+#if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64)
+#  define __KERNEL_64_BIT__
+#endif
+
+/* Qualifiers for kernel code shared by CPU and GPU */
+
+#ifndef __KERNEL_GPU__
+#  define ccl_device static inline
+#  define ccl_device_noinline static
+#  define ccl_global
+#  define ccl_constant
+#  define ccl_local
+#  define ccl_local_param
+#  define ccl_private
+#  define ccl_restrict __restrict
+#  define __KERNEL_WITH_SSE_ALIGN__
+
+#  if defined(_WIN32) && !defined(FREE_WINDOWS)
+#    define ccl_device_inline static __forceinline
+#    define ccl_device_forceinline static __forceinline
+#    define ccl_align(...) __declspec(align(__VA_ARGS__))
+#    ifdef __KERNEL_64_BIT__
+#      define ccl_try_align(...) __declspec(align(__VA_ARGS__))
+#    else  /* __KERNEL_64_BIT__ */
+#      undef __KERNEL_WITH_SSE_ALIGN__
+/* No support for function arguments (error C2719). */
+#      define ccl_try_align(...)
+#    endif  /* __KERNEL_64_BIT__ */
+#    define ccl_may_alias
+#    define ccl_always_inline __forceinline
+#    define ccl_never_inline __declspec(noinline)
+#    define ccl_maybe_unused
+#  else  /* _WIN32 && !FREE_WINDOWS */
+#    define ccl_device_inline static inline __attribute__((always_inline))
+#    define ccl_device_forceinline static inline __attribute__((always_inline))
+#    define ccl_align(...) __attribute__((aligned(__VA_ARGS__)))
+#    ifndef FREE_WINDOWS64
+#      define __forceinline inline __attribute__((always_inline))
+#    endif
+#    define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__)))
+#    define ccl_may_alias __attribute__((__may_alias__))
+#    define ccl_always_inline __attribute__((always_inline))
+#    define ccl_never_inline __attribute__((noinline))
+#    define ccl_maybe_unused __attribute__((used))
+#  endif  /* _WIN32 && !FREE_WINDOWS */
+
+/* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). */
+#  if defined(__GNUC__) && (__GNUC__ >= 7)  /* gcc7.0+ only */
+#    define ATTR_FALLTHROUGH __attribute__((fallthrough))
+#  else
+#    define ATTR_FALLTHROUGH ((void)0)
+#  endif
+#endif  /* __KERNEL_GPU__ */
+
+/* macros */
+
+/* hints for branch prediction, only use in code that runs a _lot_ */
+#if defined(__GNUC__) && defined(__KERNEL_CPU__)
+#  define LIKELY(x)       __builtin_expect(!!(x), 1)
+#  define UNLIKELY(x)     __builtin_expect(!!(x), 0)
+#else
+#  define LIKELY(x)       (x)
+#  define UNLIKELY(x)     (x)
+#endif
+
+#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && _MSC_VER >= 1800))
+#  define HAS_CPP11_FEATURES
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#  if defined(HAS_CPP11_FEATURES)
+/* Some magic to be sure we don't have reference in the type. */
+template<typename T> static inline T decltype_helper(T x) { return x; }
+#    define TYPEOF(x) decltype(decltype_helper(x))
+#  else
+#    define TYPEOF(x) typeof(x)
+#  endif
+#endif
+
+/* Causes warning:
+ * incompatible types when assigning to type 'Foo' from type 'Bar'
+ * ... the compiler optimizes away the temp var */
+#ifdef __GNUC__
+#define CHECK_TYPE(var, type)  {  \
+       TYPEOF(var) *__tmp;           \
+       __tmp = (type *)NULL;         \
+       (void)__tmp;                  \
+} (void)0
+
+#define CHECK_TYPE_PAIR(var_a, var_b)  {  \
+       TYPEOF(var_a) *__tmp;                 \
+       __tmp = (typeof(var_b) *)NULL;        \
+       (void)__tmp;                          \
+} (void)0
+#else
+#  define CHECK_TYPE(var, type)
+#  define CHECK_TYPE_PAIR(var_a, var_b)
+#endif
+
+/* can be used in simple macros */
+#define CHECK_TYPE_INLINE(val, type) \
+       ((void)(((type)0) != (val)))
+
+#ifndef __KERNEL_GPU__
+#  include <cassert>
+#  define util_assert(statement)  assert(statement)
+#else
+#  define util_assert(statement)
+#endif
+
+#endif /* __UTIL_DEFINES_H__ */
+
index b719640b19c9891beb79e2080cf7b81cf93df100..4d51ec5570ac14e71ae095826ebdc5ec62d7a2c0 100644 (file)
@@ -94,6 +94,7 @@ ccl_device_inline float fminf(float a, float b)
 #ifndef __KERNEL_GPU__
 using std::isfinite;
 using std::isnan;
+using std::sqrt;
 
 ccl_device_inline int abs(int x)
 {
index bb04c4aa2d929724c2e9fa78772bdd80d14fd45c..e73e5bc17a225cd9b38f7e288dfb7fd82277f509 100644 (file)
@@ -108,8 +108,7 @@ ccl_device_inline float3 operator*(const float3& a, const float f)
 
 ccl_device_inline float3 operator*(const float f, const float3& a)
 {
-       /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
-#if defined(__KERNEL_SSE__) && 0
+#if defined(__KERNEL_SSE__)
        return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128));
 #else
        return make_float3(a.x*f, a.y*f, a.z*f);
@@ -118,10 +117,8 @@ ccl_device_inline float3 operator*(const float f, const float3& a)
 
 ccl_device_inline float3 operator/(const float f, const float3& a)
 {
-       /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
-#if defined(__KERNEL_SSE__) && 0
-       __m128 rc = _mm_rcp_ps(a.m128);
-       return float3(_mm_mul_ps(_mm_set1_ps(f),rc));
+#if defined(__KERNEL_SSE__)
+       return float3(_mm_div_ps(_mm_set1_ps(f), a.m128));
 #else
        return make_float3(f / a.x, f / a.y, f / a.z);
 #endif
@@ -135,10 +132,8 @@ ccl_device_inline float3 operator/(const float3& a, const float f)
 
 ccl_device_inline float3 operator/(const float3& a, const float3& b)
 {
-       /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
-#if defined(__KERNEL_SSE__) && 0
-       __m128 rc = _mm_rcp_ps(b.m128);
-       return float3(_mm_mul_ps(a, rc));
+#if defined(__KERNEL_SSE__)
+       return float3(_mm_div_ps(a.m128, b.m128));
 #else
        return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
 #endif
@@ -282,9 +277,8 @@ ccl_device_inline float3 mix(const float3& a, const float3& b, float t)
 ccl_device_inline float3 rcp(const float3& a)
 {
 #ifdef __KERNEL_SSE__
-       const float4 r(_mm_rcp_ps(a.m128));
-       return float3(_mm_sub_ps(_mm_add_ps(r, r),
-                                _mm_mul_ps(_mm_mul_ps(r, r), a)));
+       /* Don't use _mm_rcp_ps due to poor precision. */
+       return float3(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
 #else
        return make_float3(1.0f/a.x, 1.0f/a.y, 1.0f/a.z);
 #endif
index d89121b3a1d45853a1f698c806258c937bfbaf8a..adb9a76a4349f46feeeaef7ae188c653ab29fe79 100644 (file)
@@ -48,23 +48,30 @@ ccl_device_inline bool operator==(const float4& a, const float4& b);
 ccl_device_inline float dot(const float4& a, const float4& b);
 ccl_device_inline float len_squared(const float4& a);
 ccl_device_inline float4 rcp(const float4& a);
+ccl_device_inline float4 sqrt(const float4& a);
+ccl_device_inline float4 sqr(const float4& a);
 ccl_device_inline float4 cross(const float4& a, const float4& b);
 ccl_device_inline bool is_zero(const float4& a);
-ccl_device_inline float reduce_add(const float4& a);
 ccl_device_inline float average(const float4& a);
 ccl_device_inline float len(const float4& a);
 ccl_device_inline float4 normalize(const float4& a);
 ccl_device_inline float4 safe_normalize(const float4& a);
 ccl_device_inline float4 min(const float4& a, const float4& b);
 ccl_device_inline float4 max(const float4& a, const float4& b);
+ccl_device_inline float4 fabs(const float4& a);
 #endif  /* !__KERNEL_OPENCL__*/
 
 #ifdef __KERNEL_SSE__
 template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
 __forceinline const float4 shuffle(const float4& b);
+template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
+__forceinline const float4 shuffle(const float4& a, const float4& b);
 
 template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b);
 
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& a, const float4& b);
+template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4& a, const float4& b);
+
 #  ifdef __KERNEL_SSE3__
 template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b);
 template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b);
@@ -77,9 +84,7 @@ ccl_device_inline float4 select(const int4& mask,
                                 const float4& b);
 ccl_device_inline float4 reduce_min(const float4& a);
 ccl_device_inline float4 reduce_max(const float4& a);
-#  if 0
 ccl_device_inline float4 reduce_add(const float4& a);
-#  endif
 #endif  /* !__KERNEL_GPU__ */
 
 /*******************************************************************************
@@ -128,7 +133,7 @@ ccl_device_inline float4 operator/(const float4& a, float f)
 ccl_device_inline float4 operator/(const float4& a, const float4& b)
 {
 #ifdef __KERNEL_SSE__
-       return a * rcp(b);
+       return float4(_mm_div_ps(a.m128, b.m128));
 #else
        return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
 #endif
@@ -224,14 +229,30 @@ ccl_device_inline float len_squared(const float4& a)
 ccl_device_inline float4 rcp(const float4& a)
 {
 #ifdef __KERNEL_SSE__
-       float4 r(_mm_rcp_ps(a.m128));
-       return float4(_mm_sub_ps(_mm_add_ps(r, r),
-                                _mm_mul_ps(_mm_mul_ps(r, r), a)));
+       /* Don't use _mm_rcp_ps due to poor precision. */
+       return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
 #else
        return make_float4(1.0f/a.x, 1.0f/a.y, 1.0f/a.z, 1.0f/a.w);
 #endif
 }
 
+ccl_device_inline float4 sqrt(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+       return float4(_mm_sqrt_ps(a.m128));
+#else
+       return make_float4(sqrtf(a.x),
+                          sqrtf(a.y),
+                          sqrtf(a.z),
+                          sqrtf(a.w));
+#endif
+}
+
+ccl_device_inline float4 sqr(const float4& a)
+{
+       return a * a;
+}
+
 ccl_device_inline float4 cross(const float4& a, const float4& b)
 {
 #ifdef __KERNEL_SSE__
@@ -254,20 +275,24 @@ ccl_device_inline bool is_zero(const float4& a)
 #endif
 }
 
-ccl_device_inline float reduce_add(const float4& a)
+ccl_device_inline float4 reduce_add(const float4& a)
 {
 #ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE3__
+    float4 h(_mm_hadd_ps(a.m128, a.m128));
+    return float4( _mm_hadd_ps(h.m128, h.m128));
+#  else
        float4 h(shuffle<1,0,3,2>(a) + a);
-       /* TODO(sergey): Investigate efficiency. */
-       return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h);
+       return  shuffle<2,3,0,1>(h) + h;
+#  endif
 #else
-       return ((a.x + a.y) + (a.z + a.w));
+       return make_float4(((a.x + a.y) + (a.z + a.w)));
 #endif
 }
 
 ccl_device_inline float average(const float4& a)
 {
-       return reduce_add(a) * 0.25f;
+       return reduce_add(a)[0] * 0.25f;
 }
 
 ccl_device_inline float len(const float4& a)
@@ -309,6 +334,18 @@ ccl_device_inline float4 max(const float4& a, const float4& b)
                           max(a.w, b.w));
 #endif
 }
+
+ccl_device_inline float4 fabs(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+       return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
+#else
+       return make_float4(fabsf(a.x),
+                          fabsf(a.y),
+                          fabsf(a.z),
+                          fabsf(a.w));
+#endif
+}
 #endif  /* !__KERNEL_OPENCL__*/
 
 #ifdef __KERNEL_SSE__
@@ -320,11 +357,28 @@ __forceinline const float4 shuffle(const float4& b)
                                  _MM_SHUFFLE(index_3, index_2, index_1, index_0))));
 }
 
+template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
+__forceinline const float4 shuffle(const float4& a, const float4& b)
+{
+       return float4(_mm_shuffle_ps(a.m128, b.m128,
+                                    _MM_SHUFFLE(index_3, index_2, index_1, index_0)));
+}
+
 template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b)
 {
        return float4(_mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b))));
 }
 
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& a, const float4& b)
+{
+       return float4(_mm_movelh_ps(a.m128, b.m128));
+}
+
+template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4& a, const float4& b)
+{
+       return float4(_mm_movehl_ps(b.m128, a.m128));
+}
+
 #  ifdef __KERNEL_SSE3__
 template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b)
 {
@@ -344,9 +398,7 @@ ccl_device_inline float4 select(const int4& mask,
                                 const float4& b)
 {
 #ifdef __KERNEL_SSE__
-       /* TODO(sergey): avoid cvt. */
-       return float4(_mm_or_ps(_mm_and_ps(_mm_cvtepi32_ps(mask), a),
-                               _mm_andnot_ps(_mm_cvtepi32_ps(mask), b)));
+       return float4(_mm_blendv_ps(b.m128, a.m128, _mm_castsi128_ps(mask.m128)));
 #else
        return make_float4((mask.x)? a.x: b.x,
                           (mask.y)? a.y: b.y,
@@ -355,6 +407,13 @@ ccl_device_inline float4 select(const int4& mask,
 #endif
 }
 
+ccl_device_inline float4 mask(const int4& mask,
+                              const float4& a)
+{
+       /* Replace elements of x with zero where mask isn't set. */
+       return select(mask, a, make_float4(0.0f));
+}
+
 ccl_device_inline float4 reduce_min(const float4& a)
 {
 #ifdef __KERNEL_SSE__
@@ -375,17 +434,15 @@ ccl_device_inline float4 reduce_max(const float4& a)
 #endif
 }
 
-#if 0
-ccl_device_inline float4 reduce_add(const float4& a)
+ccl_device_inline float4 load_float4(const float *v)
 {
 #ifdef __KERNEL_SSE__
-       float4 h = shuffle<1,0,3,2>(a) + a;
-       return shuffle<2,3,0,1>(h) + h;
+       return float4(_mm_loadu_ps(v));
 #else
-       return make_float4((a.x + a.y) + (a.z + a.w));
+       return make_float4(v[0], v[1], v[2], v[3]);
 #endif
 }
-#endif
+
 #endif  /* !__KERNEL_GPU__ */
 
 CCL_NAMESPACE_END
index c7511f8306e6d3a59fde2abbca934d2128e26a42..b31dbe4fc670cfd5c365af0253b7cef9b1e47c2b 100644 (file)
@@ -223,20 +223,20 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float
 {
        const float singular_epsilon = 1e-9f;
 
-       for (int row = 0; row < n; row++) {
-               for (int col = 0; col < n; col++) {
+       for(int row = 0; row < n; row++) {
+               for(int col = 0; col < n; col++) {
                        MATS(V, n, row, col, v_stride) = (col == row) ? 1.0f : 0.0f;
                }
        }
 
-       for (int sweep = 0; sweep < 8; sweep++) {
+       for(int sweep = 0; sweep < 8; sweep++) {
                float off_diagonal = 0.0f;
-               for (int row = 1; row < n; row++) {
-                       for (int col = 0; col < row; col++) {
+               for(int row = 1; row < n; row++) {
+                       for(int col = 0; col < row; col++) {
                                off_diagonal += fabsf(MAT(A, n, row, col));
                        }
                }
-               if (off_diagonal < 1e-7f) {
+               if(off_diagonal < 1e-7f) {
                        /* The matrix has nearly reached diagonal form.
                         * Since the eigenvalues are only used to determine truncation, their exact values aren't required - a relative error of a few ULPs won't matter at all. */
                        break;
@@ -253,7 +253,7 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float
                                float abs_element = fabsf(element);
 
                                /* If we're in a later sweep and the element already is very small, just set it to zero and skip the rotation. */
-                               if (sweep > 3 && abs_element <= singular_epsilon*fabsf(MAT(A, n, row, row)) && abs_element <= singular_epsilon*fabsf(MAT(A, n, col, col))) {
+                               if(sweep > 3 && abs_element <= singular_epsilon*fabsf(MAT(A, n, row, row)) && abs_element <= singular_epsilon*fabsf(MAT(A, n, col, col))) {
                                        MAT(A, n, row, col) = 0.0f;
                                        continue;
                                }
@@ -272,10 +272,10 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float
                                 * Then, we compute sin(phi) and cos(phi) themselves. */
                                float singular_diff = MAT(A, n, row, row) - MAT(A, n, col, col);
                                float ratio;
-                               if (abs_element > singular_epsilon*fabsf(singular_diff)) {
+                               if(abs_element > singular_epsilon*fabsf(singular_diff)) {
                                        float cot_2phi = 0.5f*singular_diff / element;
                                        ratio = 1.0f / (fabsf(cot_2phi) + sqrtf(1.0f + cot_2phi*cot_2phi));
-                                       if (cot_2phi < 0.0f) ratio = -ratio; /* Copy sign. */
+                                       if(cot_2phi < 0.0f) ratio = -ratio; /* Copy sign. */
                                }
                                else {
                                        ratio = element / singular_diff;
@@ -315,21 +315,21 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float
        }
 
        /* Sort eigenvalues and the associated eigenvectors. */
-       for (int i = 0; i < n - 1; i++) {
+       for(int i = 0; i < n - 1; i++) {
                float v = MAT(A, n, i, i);
                int k = i;
-               for (int j = i; j < n; j++) {
-                       if (MAT(A, n, j, j) >= v) {
+               for(int j = i; j < n; j++) {
+                       if(MAT(A, n, j, j) >= v) {
                                v = MAT(A, n, j, j);
                                k = j;
                        }
                }
-               if (k != i) {
+               if(k != i) {
                        /* Swap eigenvalues. */
                        MAT(A, n, k, k) = MAT(A, n, i, i);
                        MAT(A, n, i, i) = v;
                        /* Swap eigenvectors. */
-                       for (int j = 0; j < n; j++) {
+                       for(int j = 0; j < n; j++) {
                                float v = MATS(V, n, i, j, v_stride);
                                MATS(V, n, i, j, v_stride) = MATS(V, n, k, j, v_stride);
                                MATS(V, n, k, j, v_stride) = v;
@@ -339,59 +339,59 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float
 }
 
 #ifdef __KERNEL_SSE3__
-ccl_device_inline void math_vector_zero_sse(__m128 *A, int n)
+ccl_device_inline void math_vector_zero_sse(float4 *A, int n)
 {
        for(int i = 0; i < n; i++) {
-               A[i] = _mm_setzero_ps();
+               A[i] = make_float4(0.0f);
        }
 }
 
-ccl_device_inline void math_matrix_zero_sse(__m128 *A, int n)
+ccl_device_inline void math_matrix_zero_sse(float4 *A, int n)
 {
        for(int row = 0; row < n; row++) {
                for(int col = 0; col <= row; col++) {
-                       MAT(A, n, row, col) = _mm_setzero_ps();
+                       MAT(A, n, row, col) = make_float4(0.0f);
                }
        }
 }
 
 /* Add Gramian matrix of v to A.
  * The Gramian matrix of v is v^T*v, so element (i,j) is v[i]*v[j]. */
-ccl_device_inline void math_matrix_add_gramian_sse(__m128 *A, int n, const __m128 *ccl_restrict v, __m128 weight)
+ccl_device_inline void math_matrix_add_gramian_sse(float4 *A, int n, const float4 *ccl_restrict v, float4 weight)
 {
        for(int row = 0; row < n; row++) {
                for(int col = 0; col <= row; col++) {
-                       MAT(A, n, row, col) = _mm_add_ps(MAT(A, n, row, col), _mm_mul_ps(_mm_mul_ps(v[row], v[col]), weight));
+                       MAT(A, n, row, col) = MAT(A, n, row, col) + v[row] * v[col] * weight;
                }
        }
 }
 
-ccl_device_inline void math_vector_add_sse(__m128 *V, int n, const __m128 *ccl_restrict a)
+ccl_device_inline void math_vector_add_sse(float4 *V, int n, const float4 *ccl_restrict a)
 {
        for(int i = 0; i < n; i++) {
-               V[i] = _mm_add_ps(V[i], a[i]);
+               V[i] += a[i];
        }
 }
 
-ccl_device_inline void math_vector_mul_sse(__m128 *V, int n, const __m128 *ccl_restrict a)
+ccl_device_inline void math_vector_mul_sse(float4 *V, int n, const float4 *ccl_restrict a)
 {
        for(int i = 0; i < n; i++) {
-               V[i] = _mm_mul_ps(V[i], a[i]);
+               V[i] *= a[i];
        }
 }
 
-ccl_device_inline void math_vector_max_sse(__m128 *a, const __m128 *ccl_restrict b, int n)
+ccl_device_inline void math_vector_max_sse(float4 *a, const float4 *ccl_restrict b, int n)
 {
        for(int i = 0; i < n; i++) {
-               a[i] = _mm_max_ps(a[i], b[i]);
+               a[i] = max(a[i], b[i]);
        }
 }
 
-ccl_device_inline void math_matrix_hsum(float *A, int n, const __m128 *ccl_restrict B)
+ccl_device_inline void math_matrix_hsum(float *A, int n, const float4 *ccl_restrict B)
 {
        for(int row = 0; row < n; row++) {
                for(int col = 0; col <= row; col++) {
-                       MAT(A, n, row, col) = _mm_hsum_ss(MAT(B, n, row, col));
+                       MAT(A, n, row, col) = reduce_add(MAT(B, n, row, col))[0];
                }
        }
 }
index 6f70a474fe7f02cb4d25e3e3be48ee6c7565f6fe..0382c0811dd7a58a662b25323504fd8d1b083741 100644 (file)
 
 #ifndef __KERNEL_GPU__
 
-/* quiet unused define warnings */
-#if defined(__KERNEL_SSE2__)  || \
-       defined(__KERNEL_SSE3__)  || \
-       defined(__KERNEL_SSSE3__) || \
-       defined(__KERNEL_SSE41__) || \
-       defined(__KERNEL_AVX__)   || \
-       defined(__KERNEL_AVX2__)
-       /* do nothing */
-#endif
-
 /* x86
  *
  * Compile a regular, SSE2 and SSE3 kernel. */
 
 #endif  /* defined(__x86_64__) || defined(_M_X64) */
 
-/* SSE Experiment
- *
- * This is disabled code for an experiment to use SSE types globally for types
- * such as float3 and float4. Currently this gives an overall slowdown. */
-
-#if 0
-#  define __KERNEL_SSE__
-#  ifndef __KERNEL_SSE2__
-#    define __KERNEL_SSE2__
-#  endif
-#  ifndef __KERNEL_SSE3__
-#    define __KERNEL_SSE3__
-#  endif
-#  ifndef __KERNEL_SSSE3__
-#    define __KERNEL_SSSE3__
-#  endif
-#  ifndef __KERNEL_SSE4__
-#    define __KERNEL_SSE4__
-#  endif
-#endif
-
-/* SSE Intrinsics includes
- *
- * We assume __KERNEL_SSEX__ flags to have been defined at this point */
-
-/* SSE intrinsics headers */
-#ifndef FREE_WINDOWS64
-
-#ifdef _MSC_VER
-#  include <intrin.h>
-#elif (defined(__x86_64__) || defined(__i386__))
-#  include <x86intrin.h>
-#endif
-
-#else
-
-/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
- * Since we can't avoid including <windows.h>, better only include that */
-#include "util/util_windows.h"
-
-#endif
-
 #endif
 
 #endif /* __UTIL_OPTIMIZATION_H__ */
index 587febe3e5238be4a6bb2f215414f35546cc1d46..a2b3247b20769884adf9abd9cfb1ec5d129c993d 100644 (file)
 #ifndef __UTIL_SIMD_TYPES_H__
 #define __UTIL_SIMD_TYPES_H__
 
+#ifndef __KERNEL_GPU__
+
 #include <limits>
 
 #include "util/util_debug.h"
-#include "util/util_types.h"
+#include "util/util_defines.h"
+
+/* SSE Intrinsics includes
+ *
+ * We assume __KERNEL_SSEX__ flags to have been defined at this point */
+
+/* SSE intrinsics headers */
+#ifndef FREE_WINDOWS64
+
+#ifdef _MSC_VER
+#  include <intrin.h>
+#elif (defined(__x86_64__) || defined(__i386__))
+#  include <x86intrin.h>
+#endif
+
+#else
+
+/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
+ * Since we can't avoid including <windows.h>, better only include that */
+#include "util/util_windows.h"
+
+#endif
 
 CCL_NAMESPACE_BEGIN
 
 #ifdef __KERNEL_SSE2__
 
-struct sseb;
-struct ssei;
-struct ssef;
-
 extern const __m128 _mm_lookupmask_ps[16];
 
 /* Special Types */
@@ -328,12 +347,9 @@ __forceinline size_t __bscf(size_t& v)
 
 #endif /* _WIN32 */
 
-static const unsigned int BITSCAN_NO_BIT_SET_32 = 32;
-static const size_t       BITSCAN_NO_BIT_SET_64 = 64;
+#if !(defined(__SSE4_1__) || defined(__SSE4_2__))
 
-#ifdef __KERNEL_SSE3__
-/* Emulation of SSE4 functions with SSE3 */
-#  ifndef __KERNEL_SSE41__
+/* Emulation of SSE4 functions with SSE2 */
 
 #define _MM_FROUND_TO_NEAREST_INT    0x00
 #define _MM_FROUND_TO_NEG_INF        0x01
@@ -342,48 +358,31 @@ static const size_t       BITSCAN_NO_BIT_SET_64 = 64;
 #define _MM_FROUND_CUR_DIRECTION     0x04
 
 #undef _mm_blendv_ps
-#define _mm_blendv_ps __emu_mm_blendv_ps
 __forceinline __m128 _mm_blendv_ps( __m128 value, __m128 input, __m128 mask ) { 
     return _mm_or_ps(_mm_and_ps(mask, input), _mm_andnot_ps(mask, value)); 
 }
 
 #undef _mm_blend_ps
-#define _mm_blend_ps __emu_mm_blend_ps
 __forceinline __m128 _mm_blend_ps( __m128 value, __m128 input, const int mask ) { 
     assert(mask < 0x10); return _mm_blendv_ps(value, input, _mm_lookupmask_ps[mask]); 
 }
 
 #undef _mm_blendv_epi8
-#define _mm_blendv_epi8 __emu_mm_blendv_epi8
 __forceinline __m128i _mm_blendv_epi8( __m128i value, __m128i input, __m128i mask ) { 
     return _mm_or_si128(_mm_and_si128(mask, input), _mm_andnot_si128(mask, value)); 
 }
 
-#undef _mm_mullo_epi32
-#define _mm_mullo_epi32 __emu_mm_mullo_epi32
-__forceinline __m128i _mm_mullo_epi32( __m128i value, __m128i input ) {
-  __m128i rvalue;
-  char* _r = (char*)(&rvalue + 1);
-  char* _v = (char*)(& value + 1);
-  char* _i = (char*)(& input + 1);
-  for( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32_t*)(_r + i)) = *((int32_t*)(_v + i))*  *((int32_t*)(_i + i));
-  return rvalue;
-}
-
 #undef _mm_min_epi32
-#define _mm_min_epi32 __emu_mm_min_epi32
 __forceinline __m128i _mm_min_epi32( __m128i value, __m128i input ) { 
     return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input)); 
 }
 
 #undef _mm_max_epi32
-#define _mm_max_epi32 __emu_mm_max_epi32
 __forceinline __m128i _mm_max_epi32( __m128i value, __m128i input ) { 
     return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input)); 
 }
 
 #undef _mm_extract_epi32
-#define _mm_extract_epi32 __emu_mm_extract_epi32
 __forceinline int _mm_extract_epi32( __m128i input, const int index ) {
   switch ( index ) {
   case 0: return _mm_cvtsi128_si32(input);
@@ -395,24 +394,15 @@ __forceinline int _mm_extract_epi32( __m128i input, const int index ) {
 }
 
 #undef _mm_insert_epi32
-#define _mm_insert_epi32 __emu_mm_insert_epi32
 __forceinline __m128i _mm_insert_epi32( __m128i value, int input, const int index ) { 
     assert(index >= 0 && index < 4); ((int*)&value)[index] = input; return value; 
 }
 
-#undef _mm_extract_ps
-#define _mm_extract_ps __emu_mm_extract_ps
-__forceinline int _mm_extract_ps( __m128 input, const int index ) {
-  int32_t* ptr = (int32_t*)&input; return ptr[index];
-}
-
 #undef _mm_insert_ps
-#define _mm_insert_ps __emu_mm_insert_ps
 __forceinline __m128 _mm_insert_ps( __m128 value, __m128 input, const int index )
 { assert(index < 0x100); ((float*)&value)[(index >> 4)&0x3] = ((float*)&input)[index >> 6]; return _mm_andnot_ps(_mm_lookupmask_ps[index&0xf], value); }
 
 #undef _mm_round_ps
-#define _mm_round_ps __emu_mm_round_ps
 __forceinline __m128 _mm_round_ps( __m128 value, const int flags )
 {
   switch ( flags )
@@ -425,57 +415,7 @@ __forceinline __m128 _mm_round_ps( __m128 value, const int flags )
   return value;
 }
 
-#    ifdef _M_X64
-#undef _mm_insert_epi64
-#define _mm_insert_epi64 __emu_mm_insert_epi64
-__forceinline __m128i _mm_insert_epi64( __m128i value, __int64 input, const int index ) { 
-    assert(size_t(index) < 4); ((__int64*)&value)[index] = input; return value; 
-}
-
-#undef _mm_extract_epi64
-#define _mm_extract_epi64 __emu_mm_extract_epi64
-__forceinline __int64 _mm_extract_epi64( __m128i input, const int index ) { 
-    assert(size_t(index) < 2); 
-    return index == 0 ? _mm_cvtsi128_si64x(input) : _mm_cvtsi128_si64x(_mm_unpackhi_epi64(input, input)); 
-}
-#    endif
-
-#  endif
-
-#undef _mm_fabs_ps
-#define _mm_fabs_ps(x) _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))
-
-/* Return a __m128 with every element set to the largest element of v. */
-ccl_device_inline __m128 _mm_hmax_ps(__m128 v)
-{
-  /* v[0, 1, 2, 3] => [0, 1, 0, 1] and [2, 3, 2, 3] => v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] */
-  v = _mm_max_ps(_mm_movehl_ps(v, v), _mm_movelh_ps(v, v));
-  /* v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] => [4 times max(1, 3)] and [4 times max(0, 2)] => v[4 times max(0, 1, 2, 3)] */
-  v = _mm_max_ps(_mm_movehdup_ps(v), _mm_moveldup_ps(v));
-  return v;
-}
-
-/* Return the sum of the four elements of x. */
-ccl_device_inline float _mm_hsum_ss(__m128 x)
-{
-    __m128 a = _mm_movehdup_ps(x);
-    __m128 b = _mm_add_ps(x, a);
-    return _mm_cvtss_f32(_mm_add_ss(_mm_movehl_ps(a, b), b));
-}
-
-/* Return a __m128 with every element set to the sum of the four elements of x. */
-ccl_device_inline __m128 _mm_hsum_ps(__m128 x)
-{
-    x = _mm_hadd_ps(x, x);
-    x = _mm_hadd_ps(x, x);
-    return x;
-}
-
-/* Replace elements of x with zero where mask isn't set. */
-#undef _mm_mask_ps
-#define _mm_mask_ps(x, mask) _mm_blendv_ps(_mm_setzero_ps(), x, mask)
-
-#endif
+#endif /* !(defined(__SSE4_1__) || defined(__SSE4_2__)) */
 
 #else  /* __KERNEL_SSE2__ */
 
@@ -496,13 +436,19 @@ ccl_device_inline int bitscan(int value)
 
 #endif /* __KERNEL_SSE2__ */
 
+/* quiet unused define warnings */
+#if defined(__KERNEL_SSE2__)  || \
+       defined(__KERNEL_SSE3__)  || \
+       defined(__KERNEL_SSSE3__) || \
+       defined(__KERNEL_SSE41__) || \
+       defined(__KERNEL_AVX__)   || \
+       defined(__KERNEL_AVX2__)
+       /* do nothing */
+#endif
+
 CCL_NAMESPACE_END
 
-#include "util/util_math.h"
-#include "util/util_sseb.h"
-#include "util/util_ssei.h"
-#include "util/util_ssef.h"
-#include "util/util_avxf.h"
+#endif /* __KERNEL_GPU__ */
 
 #endif /* __UTIL_SIMD_TYPES_H__ */
 
index 6e669701f3bd9630293b90e275cba470961c1d60..93c22aafdcd1c894d1f1b33970d105d4aca76ad7 100644 (file)
@@ -22,6 +22,9 @@ CCL_NAMESPACE_BEGIN
 
 #ifdef __KERNEL_SSE2__
 
+struct ssei;
+struct ssef;
+
 /*! 4-wide SSE bool type. */
 struct sseb
 {
index cf99a08efae1215059e37fe4e588dd14d26b9e51..bb007ff84a92cbf580a3b837e8d5e8bb1c1451e5 100644 (file)
@@ -22,6 +22,9 @@ CCL_NAMESPACE_BEGIN
 
 #ifdef __KERNEL_SSE2__
 
+struct sseb;
+struct ssef;
+
 /*! 4-wide SSE float type. */
 struct ssef
 {
index 5f62569268cd80443010de592fbd41b492a50d04..ef2a9e68b7db74928fee256064d920b68f15e436 100644 (file)
@@ -22,6 +22,9 @@ CCL_NAMESPACE_BEGIN
 
 #ifdef __KERNEL_SSE2__
 
+struct sseb;
+struct ssef;
+
 /*! 4-wide SSE integer type. */
 struct ssei
 {
@@ -234,8 +237,10 @@ __forceinline size_t select_max(const sseb& valid, const ssei& v) { const ssei a
 
 #else
 
-__forceinline int reduce_min(const ssei& v) { return min(min(v[0],v[1]),min(v[2],v[3])); }
-__forceinline int reduce_max(const ssei& v) { return max(max(v[0],v[1]),max(v[2],v[3])); }
+__forceinline int ssei_min(int a, int b) { return (a < b)? a: b; }
+__forceinline int ssei_max(int a, int b) { return (a > b)? a: b; }
+__forceinline int reduce_min(const ssei& v) { return ssei_min(ssei_min(v[0],v[1]),ssei_min(v[2],v[3])); }
+__forceinline int reduce_max(const ssei& v) { return ssei_max(ssei_max(v[0],v[1]),ssei_max(v[2],v[3])); }
 __forceinline int reduce_add(const ssei& v) { return v[0]+v[1]+v[2]+v[3]; }
 
 #endif
index a5d1d7152d54bdfb1a3e0c41a0cb1c066f2c64cd..d9642df8005e254ff5cbb087da1393482a6713dd 100644 (file)
 #  include <stdlib.h>
 #endif
 
-/* Bitness */
+/* Standard Integer Types */
 
-#if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64)
-#  define __KERNEL_64_BIT__
+#if !defined(__KERNEL_GPU__) && !defined(_WIN32)
+#  include <stdint.h>
 #endif
 
-/* Qualifiers for kernel code shared by CPU and GPU */
-
-#ifndef __KERNEL_GPU__
-#  define ccl_device static inline
-#  define ccl_device_noinline static
-#  define ccl_global
-#  define ccl_constant
-#  define ccl_local
-#  define ccl_local_param
-#  define ccl_private
-#  define ccl_restrict __restrict
-#  define __KERNEL_WITH_SSE_ALIGN__
-
-#  if defined(_WIN32) && !defined(FREE_WINDOWS)
-#    define ccl_device_inline static __forceinline
-#    define ccl_device_forceinline static __forceinline
-#    define ccl_align(...) __declspec(align(__VA_ARGS__))
-#    ifdef __KERNEL_64_BIT__
-#      define ccl_try_align(...) __declspec(align(__VA_ARGS__))
-#    else  /* __KERNEL_64_BIT__ */
-#      undef __KERNEL_WITH_SSE_ALIGN__
-/* No support for function arguments (error C2719). */
-#      define ccl_try_align(...)
-#    endif  /* __KERNEL_64_BIT__ */
-#    define ccl_may_alias
-#    define ccl_always_inline __forceinline
-#    define ccl_never_inline __declspec(noinline)
-#    define ccl_maybe_unused
-#  else  /* _WIN32 && !FREE_WINDOWS */
-#    define ccl_device_inline static inline __attribute__((always_inline))
-#    define ccl_device_forceinline static inline __attribute__((always_inline))
-#    define ccl_align(...) __attribute__((aligned(__VA_ARGS__)))
-#    ifndef FREE_WINDOWS64
-#      define __forceinline inline __attribute__((always_inline))
-#    endif
-#    define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__)))
-#    define ccl_may_alias __attribute__((__may_alias__))
-#    define ccl_always_inline __attribute__((always_inline))
-#    define ccl_never_inline __attribute__((noinline))
-#    define ccl_maybe_unused __attribute__((used))
-#  endif  /* _WIN32 && !FREE_WINDOWS */
-
-/* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). */
-#  if defined(__GNUC__) && (__GNUC__ >= 7)  /* gcc7.0+ only */
-#    define ATTR_FALLTHROUGH __attribute__((fallthrough))
-#  else
-#    define ATTR_FALLTHROUGH ((void)0)
-#  endif
-#endif  /* __KERNEL_GPU__ */
-
-/* Standard Integer Types */
+#include "util/util_defines.h"
 
 #ifndef __KERNEL_GPU__
-/* int8_t, uint16_t, and friends */
-#  ifndef _WIN32
-#    include <stdint.h>
-#  endif
-/* SIMD Types */
-#  include "util/util_optimization.h"
-#endif  /* __KERNEL_GPU__ */
+#  include "util/util_simd.h"
+#endif
 
 CCL_NAMESPACE_BEGIN
 
@@ -201,65 +146,8 @@ enum ExtensionType {
        EXTENSION_NUM_TYPES,
 };
 
-/* macros */
-
-/* hints for branch prediction, only use in code that runs a _lot_ */
-#if defined(__GNUC__) && defined(__KERNEL_CPU__)
-#  define LIKELY(x)       __builtin_expect(!!(x), 1)
-#  define UNLIKELY(x)     __builtin_expect(!!(x), 0)
-#else
-#  define LIKELY(x)       (x)
-#  define UNLIKELY(x)     (x)
-#endif
-
-#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && _MSC_VER >= 1800))
-#  define HAS_CPP11_FEATURES
-#endif
-
-#if defined(__GNUC__) || defined(__clang__)
-#  if defined(HAS_CPP11_FEATURES)
-/* Some magic to be sure we don't have reference in the type. */
-template<typename T> static inline T decltype_helper(T x) { return x; }
-#    define TYPEOF(x) decltype(decltype_helper(x))
-#  else
-#    define TYPEOF(x) typeof(x)
-#  endif
-#endif
-
-/* Causes warning:
- * incompatible types when assigning to type 'Foo' from type 'Bar'
- * ... the compiler optimizes away the temp var */
-#ifdef __GNUC__
-#define CHECK_TYPE(var, type)  {  \
-       TYPEOF(var) *__tmp;           \
-       __tmp = (type *)NULL;         \
-       (void)__tmp;                  \
-} (void)0
-
-#define CHECK_TYPE_PAIR(var_a, var_b)  {  \
-       TYPEOF(var_a) *__tmp;                 \
-       __tmp = (typeof(var_b) *)NULL;        \
-       (void)__tmp;                          \
-} (void)0
-#else
-#  define CHECK_TYPE(var, type)
-#  define CHECK_TYPE_PAIR(var_a, var_b)
-#endif
-
-/* can be used in simple macros */
-#define CHECK_TYPE_INLINE(val, type) \
-       ((void)(((type)0) != (val)))
-
-
 CCL_NAMESPACE_END
 
-#ifndef __KERNEL_GPU__
-#  include <cassert>
-#  define util_assert(statement)  assert(statement)
-#else
-#  define util_assert(statement)
-#endif
-
 /* Vectorized types declaration. */
 #include "util/util_types_uchar2.h"
 #include "util/util_types_uchar3.h"
@@ -298,5 +186,13 @@ CCL_NAMESPACE_END
 
 #include "util/util_types_vector3_impl.h"
 
+/* SSE types. */
+#ifndef __KERNEL_GPU__
+#  include "util/util_sseb.h"
+#  include "util/util_ssei.h"
+#  include "util/util_ssef.h"
+#  include "util/util_avxf.h"
+#endif
+
 #endif /* __UTIL_TYPES_H__ */
 
index 9af3ebf3cbbcdc934ea2eb3cb86ab5a0026db582..1872fe108cada16bf266a601a3b2e6fc36b74f78 100644 (file)
@@ -821,7 +821,8 @@ static void clip_keymap(struct wmKeyConfig *keyconf)
 #endif
 }
 
-static const char *clip_context_dir[] = {"edit_movieclip", "edit_mask", NULL};
+/* DO NOT make this static, this hides the symbol and breaks API generation script. */
+const char *clip_context_dir[] = {"edit_movieclip", "edit_mask", NULL};
 
 static int clip_context(const bContext *C, const char *member, bContextDataResult *result)
 {
index 5dfcba9b4d1e2bfe04d2fbacd90e1a50eda99099..3b04e6c80cdd73e16ef1bb2982290774f899dfb6 100644 (file)
@@ -436,7 +436,8 @@ static void sequencer_dropboxes(void)
 
 /* ************* end drop *********** */
 
-static const char *sequencer_context_dir[] = {"edit_mask", NULL};
+/* DO NOT make this static, this hides the symbol and breaks API generation script. */
+const char *sequencer_context_dir[] = {"edit_mask", NULL};
 
 static int sequencer_context(const bContext *C, const char *member, bContextDataResult *result)
 {
index cedf50a3035e20ef310163c83186d84b51019d66..221baeadbee874c0872324d3432655874d7016c1 100644 (file)
@@ -729,20 +729,20 @@ int wm_homefile_read(
        if (filepath_startup_override != NULL) {
                /* pass */
        }
-       else if (app_template_override && app_template_override[0]) {
+       else if (app_template_override) {
+               /* This may be clearing the current template by setting to an empty string. */
                app_template = app_template_override;
        }
        else if (!use_factory_settings && U.app_template[0]) {
                app_template = U.app_template;
        }
 
-       if (app_template != NULL) {
+       if ((app_template != NULL) && (app_template[0] != '\0')) {
                BKE_appdir_app_template_id_search(app_template, app_template_system, sizeof(app_template_system));
                BLI_path_join(app_template_config, sizeof(app_template_config), cfgdir, app_template, NULL);
-       }
 
-       /* insert template name into startup file */
-       if (app_template != NULL) {
+               /* Insert template name into startup file. */
+
                /* note that the path is being set even when 'use_factory_settings == true'
                 * this is done so we can load a templates factory-settings */
                if (!use_factory_settings) {
index 28ce095b0e81c42795eee09de679c05e9c5726db..afd61d5ab6741a437773653bb9259fbea4cfa3e4 100644 (file)
@@ -518,6 +518,7 @@ if(WITH_CYCLES)
                                        -blender "$<TARGET_FILE:blender>"
                                        -testdir "${TEST_SRC_DIR}/cycles/ctests/${subject}"
                                        -idiff "${OPENIMAGEIO_IDIFF}"
+                                       -outdir "${TEST_OUT_DIR}/cycles"
                                )
                        else()
                                add_test(
@@ -526,17 +527,23 @@ if(WITH_CYCLES)
                                        -blender "$<TARGET_FILE:blender>"
                                        -testdir "${TEST_SRC_DIR}/cycles/ctests/${subject}"
                                        -idiff "${OPENIMAGEIO_IDIFF}"
+                                       -outdir "${TEST_OUT_DIR}/cycles"
                                )
                        endif()
                endmacro()
                if(WITH_OPENGL_TESTS)
                        add_cycles_render_test(opengl)
                endif()
-               add_cycles_render_test(image)
+               add_cycles_render_test(displacement)
+               add_cycles_render_test(image_data_types)
+               add_cycles_render_test(image_mapping)
+               add_cycles_render_test(image_texture_limit)
                add_cycles_render_test(mblur)
                add_cycles_render_test(reports)
                add_cycles_render_test(render)
                add_cycles_render_test(shader)
+               add_cycles_render_test(shadow_catcher)
+               add_cycles_render_test(volume)
        else()
                MESSAGE(STATUS "Disabling Cycles tests because tests folder does not exist")
        endif()
index a030cc5e0de0e9db68be785cebc4d312c6ee95ec..ea84f27ab7e5b466fded600eecd6c784a4cd2d26 100755 (executable)
@@ -2,7 +2,9 @@
 # Apache License, Version 2.0
 
 import argparse
+import glob
 import os
+import pathlib
 import shutil
 import subprocess
 import sys
@@ -24,7 +26,7 @@ class COLORS_DUMMY:
 COLORS = COLORS_DUMMY
 
 
-def printMessage(type, status, message):
+def print_message(message, type=None, status=''):
     if type == 'SUCCESS':
         print(COLORS.GREEN, end="")
     elif type == 'FAILURE':
@@ -109,20 +111,126 @@ def test_get_name(filepath):
     filename = os.path.basename(filepath)
     return os.path.splitext(filename)[0]
 
-
-def verify_output(filepath):
+def test_get_images(filepath):
     testname = test_get_name(filepath)
     dirpath = os.path.dirname(filepath)
-    reference_dirpath = os.path.join(dirpath, "reference_renders")
-    reference_image = os.path.join(reference_dirpath, testname + ".png")
-    failed_image = os.path.join(reference_dirpath, testname + ".fail.png")
-    if not os.path.exists(reference_image):
+    ref_dirpath = os.path.join(dirpath, "reference_renders")
+    ref_img = os.path.join(ref_dirpath, testname + ".png")
+    new_dirpath = os.path.join(OUTDIR, os.path.basename(dirpath))
+    if not os.path.exists(new_dirpath):
+        os.makedirs(new_dirpath)
+    new_img = os.path.join(new_dirpath, testname + ".png")
+    diff_dirpath = os.path.join(OUTDIR, os.path.basename(dirpath), "diff")
+    if not os.path.exists(diff_dirpath):
+        os.makedirs(diff_dirpath)
+    diff_img = os.path.join(diff_dirpath, testname + ".diff.png")
+    return ref_img, new_img, diff_img
+
+
+class Report:
+    def __init__(self, testname):
+        self.failed_tests = ""
+        self.passed_tests = ""
+        self.testname = testname
+
+    def output(self):
+        # write intermediate data for single test
+        outdir = os.path.join(OUTDIR, self.testname)
+        f = open(os.path.join(outdir, "failed.data"), "w")
+        f.write(self.failed_tests)
+        f.close()
+
+        f = open(os.path.join(outdir, "passed.data"), "w")
+        f.write(self.passed_tests)
+        f.close()
+
+        # gather intermediate data for all tests
+        failed_data = sorted(glob.glob(os.path.join(OUTDIR, "*/failed.data")))
+        passed_data = sorted(glob.glob(os.path.join(OUTDIR, "*/passed.data")))
+
+        failed_tests = ""
+        passed_tests = ""
+
+        for filename in failed_data:
+            failed_tests += open(os.path.join(OUTDIR, filename), "r").read()
+        for filename in passed_data:
+            passed_tests += open(os.path.join(OUTDIR, filename), "r").read()
+
+        # write html for all tests
+        self.html = """
+<html>
+<head>
+    <title>Cycles Test Report</title>
+    <style>
+        img {{ image-rendering: pixelated; width: 256; background-color: #000; }}
+        table td:first-child {{ width: 100%; }}
+    </style>
+    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-alpha.6/css/bootstrap.min.css">
+</head>
+<body>
+    <div class="container">
+        <br/>
+        <h1>Cycles Test Report</h1>
+        <br/>
+        <table class="table table-striped">
+            <thead class="thead-default">
+                <tr><th>Name</th><th>New</th><th>Reference</th><th>Diff</th>
+            </thead>
+            {}{}
+        </table>
+        <br/>
+    </div>
+</body>
+</html>
+            """ . format(failed_tests, passed_tests)
+
+        filepath = os.path.join(OUTDIR, "report.html")
+        f = open(filepath, "w")
+        f.write(self.html)
+        f.close()
+
+        print_message("Report saved to: " + pathlib.Path(filepath).as_uri())
+
+    def add_test(self, filepath, error):
+        name = test_get_name(filepath)
+
+        ref_img, new_img, diff_img = test_get_images(filepath)
+
+        status = error if error else ""
+        style = """ style="background-color: #f99;" """ if error else ""
+
+        new_url = pathlib.Path(new_img).as_uri()
+        ref_url = pathlib.Path(ref_img).as_uri()
+        diff_url = pathlib.Path(diff_img).as_uri()
+
+        test_html = """
+            <tr{}>
+                <td><b>{}</b><br/>{}<br/>{}</td>
+                <td><img src="{}" onmouseover="this.src='{}';" onmouseout="this.src='{}';"></td>
+                <td><img src="{}" onmouseover="this.src='{}';" onmouseout="this.src='{}';"></td>
+                <td><img src="{}"></td>
+            </tr>""" . format(style, name, self.testname, status,
+                              new_url, ref_url, new_url,
+                              ref_url, new_url, ref_url,
+                              diff_url)
+
+        if error:
+            self.failed_tests += test_html
+        else:
+            self.passed_tests += test_html
+
+
+def verify_output(report, filepath):
+    ref_img, new_img, diff_img = test_get_images(filepath)
+    if not os.path.exists(ref_img):
         return False
+
+    # diff test with threshold
     command = (
         IDIFF,
-        "-fail", "0.015",
+        "-fail", "0.016",
         "-failpercent", "1",
-        reference_image,
+        ref_img,
         TEMP_FILE,
         )
     try:
@@ -130,47 +238,66 @@ def verify_output(filepath):
         failed = False
     except subprocess.CalledProcessError as e:
         if VERBOSE:
-            print(e.output.decode("utf-8"))
+            print_message(e.output.decode("utf-8"))
         failed = e.returncode != 1
-    if failed:
-        shutil.copy(TEMP_FILE, failed_image)
-    elif os.path.exists(failed_image):
-        os.remove(failed_image)
+
+    # generate diff image
+    command = (
+        IDIFF,
+        "-o", diff_img,
+        "-abs", "-scale", "16",
+        ref_img,
+        TEMP_FILE
+        )
+
+    try:
+        subprocess.check_output(command)
+    except subprocess.CalledProcessError as e:
+        if VERBOSE:
+            print_message(e.output.decode("utf-8"))
+
+    # copy new image
+    if os.path.exists(new_img):
+        os.remove(new_img)
+    if os.path.exists(TEMP_FILE):
+        shutil.copy(TEMP_FILE, new_img)
+
     return not failed
 
 
-def run_test(filepath):
+def run_test(report, filepath):
     testname = test_get_name(filepath)
     spacer = "." * (32 - len(testname))
-    printMessage('SUCCESS', 'RUN', testname)
+    print_message(testname, 'SUCCESS', 'RUN')
     time_start = time.time()
     error = render_file(filepath)
     status = "FAIL"
     if not error:
-        if not verify_output(filepath):
+        if not verify_output(report, filepath):
             error = "VERIFY"
     time_end = time.time()
     elapsed_ms = int((time_end - time_start) * 1000)
     if not error:
-        printMessage('SUCCESS', 'OK', "{} ({} ms)" .
-                     format(testname, elapsed_ms))
+        print_message("{} ({} ms)" . format(testname, elapsed_ms),
+                      'SUCCESS', 'OK')
     else:
         if error == "NO_CYCLES":
-            print("Can't perform tests because Cycles failed to load!")
-            return False
+            print_message("Can't perform tests because Cycles failed to load!")
+            return error
         elif error == "NO_START":
-            print('Can not perform tests because blender fails to start.',
+            print_message('Can not perform tests because blender fails to start.',
                   'Make sure INSTALL target was run.')
-            return False
+            return error
         elif error == 'VERIFY':
-            print("Rendered result is different from reference image")
+            print_message("Rendered result is different from reference image")
         else:
-            print("Unknown error %r" % error)
-        printMessage('FAILURE', 'FAILED', "{} ({} ms)" .
-                     format(testname, elapsed_ms))
+            print_message("Unknown error %r" % error)
+        print_message("{} ({} ms)" . format(testname, elapsed_ms),
+                      'FAILURE', 'FAILED')
     return error
 
 
+
 def blend_list(path):
     for dirpath, dirnames, filenames in os.walk(path):
         for filename in filenames:
@@ -178,17 +305,18 @@ def blend_list(path):
                 filepath = os.path.join(dirpath, filename)
                 yield filepath
 
-
 def run_all_tests(dirpath):
     passed_tests = []
     failed_tests = []
     all_files = list(blend_list(dirpath))
     all_files.sort()
-    printMessage('SUCCESS', "==========",
-                 "Running {} tests from 1 test case." . format(len(all_files)))
+    report = Report(os.path.basename(dirpath))
+    print_message("Running {} tests from 1 test case." .
+                  format(len(all_files)),
+                  'SUCCESS', "==========")
     time_start = time.time()
     for filepath in all_files:
-        error = run_test(filepath)
+        error = run_test(report, filepath)
         testname = test_get_name(filepath)
         if error:
             if error == "NO_CYCLES":
@@ -198,28 +326,33 @@ def run_all_tests(dirpath):
             failed_tests.append(testname)
         else:
             passed_tests.append(testname)
+        report.add_test(filepath, error)
     time_end = time.time()
     elapsed_ms = int((time_end - time_start) * 1000)
-    print("")
-    printMessage('SUCCESS', "==========",
-                 "{} tests from 1 test case ran. ({} ms total)" .
-                 format(len(all_files), elapsed_ms))
-    printMessage('SUCCESS', 'PASSED', "{} tests." .
-                 format(len(passed_tests)))
+    print_message("")
+    print_message("{} tests from 1 test case ran. ({} ms total)" .
+                  format(len(all_files), elapsed_ms),
+                  'SUCCESS', "==========")
+    print_message("{} tests." .
+                  format(len(passed_tests)),
+                  'SUCCESS', 'PASSED')
     if failed_tests:
-        printMessage('FAILURE', 'FAILED', "{} tests, listed below:" .
-                     format(len(failed_tests)))
+        print_message("{} tests, listed below:" .
+                     format(len(failed_tests)),
+                     'FAILURE', 'FAILED')
         failed_tests.sort()
         for test in failed_tests:
-            printMessage('FAILURE', "FAILED", "{}" . format(test))
-        return False
-    return True
+            print_message("{}" . format(test), 'FAILURE', "FAILED")
+
+    report.output()
+    return not bool(failed_tests)
 
 
 def create_argparse():
     parser = argparse.ArgumentParser()
     parser.add_argument("-blender", nargs="+")
     parser.add_argument("-testdir", nargs=1)
+    parser.add_argument("-outdir", nargs=1)
     parser.add_argument("-idiff", nargs=1)
     return parser
 
@@ -229,7 +362,7 @@ def main():
     args = parser.parse_args()
 
     global COLORS
-    global BLENDER, ROOT, IDIFF
+    global BLENDER, TESTDIR, IDIFF, OUTDIR
     global TEMP_FILE, TEMP_FILE_MASK, TEST_SCRIPT
     global VERBOSE
 
@@ -237,8 +370,12 @@ def main():
         COLORS = COLORS_ANSI
 
     BLENDER = args.blender[0]
-    ROOT = args.testdir[0]
+    TESTDIR = args.testdir[0]
     IDIFF = args.idiff[0]
+    OUTDIR = args.outdir[0]
+
+    if not os.path.exists(OUTDIR):
+        os.makedirs(OUTDIR)
 
     TEMP = tempfile.mkdtemp()
     TEMP_FILE_MASK = os.path.join(TEMP, "test")
@@ -248,7 +385,7 @@ def main():
 
     VERBOSE = os.environ.get("BLENDER_VERBOSE") is not None
 
-    ok = run_all_tests(ROOT)
+    ok = run_all_tests(TESTDIR)
 
     # Cleanup temp files and folders
     if os.path.exists(TEMP_FILE):