Cycles: Add BVH8 and packeted triangle intersection
authorSergey Sharybin <sergey.vfx@gmail.com>
Wed, 14 Feb 2018 10:23:30 +0000 (11:23 +0100)
committerSergey Sharybin <sergey.vfx@gmail.com>
Wed, 29 Aug 2018 13:03:09 +0000 (15:03 +0200)
This is an initial implementation of BVH8 optimization structure
and packated triangle intersection. The aim is to get faster ray
to scene intersection checks.

    Scene                BVH4      BVH8
barbershop_interior    10:24.94   10:10.74
bmw27                  02:41.25   02:38.83
classroom              08:16.49   07:56.15
fishy_cat              04:24.56   04:17.29
koro                   06:03.06   06:01.45
pavillon_barcelona     09:21.26   09:02.98
victor                 23:39.65   22:53.71

As memory goes, peak usage raises by about 4.7% in a complex
scenes.

Note that BVH8 is disabled when using OSL, this is because OSL
kernel does not get per-microarchitecture optimizations and
hence always considers BVH3 is used.

Original BVH8 patch from Anton Gavrikov.
Batched triangles intersection from Victoria Zhislina.
Extra work and tests and fixes from Maxym Dmytrychenko.

37 files changed:
intern/cycles/blender/addon/properties.py
intern/cycles/blender/blender_sync.cpp
intern/cycles/bvh/CMakeLists.txt
intern/cycles/bvh/bvh.cpp
intern/cycles/bvh/bvh.h
intern/cycles/bvh/bvh2.cpp
intern/cycles/bvh/bvh4.cpp
intern/cycles/bvh/bvh8.cpp [new file with mode: 0644]
intern/cycles/bvh/bvh8.h [new file with mode: 0644]
intern/cycles/bvh/bvh_node.cpp
intern/cycles/bvh/bvh_node.h
intern/cycles/device/device_cpu.cpp
intern/cycles/kernel/CMakeLists.txt
intern/cycles/kernel/bvh/bvh.h
intern/cycles/kernel/bvh/bvh_local.h
intern/cycles/kernel/bvh/bvh_shadow_all.h
intern/cycles/kernel/bvh/bvh_traversal.h
intern/cycles/kernel/bvh/bvh_types.h
intern/cycles/kernel/bvh/bvh_volume.h
intern/cycles/kernel/bvh/bvh_volume_all.h
intern/cycles/kernel/bvh/obvh_local.h [new file with mode: 0644]
intern/cycles/kernel/bvh/obvh_nodes.h [new file with mode: 0644]
intern/cycles/kernel/bvh/obvh_shadow_all.h [new file with mode: 0644]
intern/cycles/kernel/bvh/obvh_traversal.h [new file with mode: 0644]
intern/cycles/kernel/bvh/obvh_volume.h [new file with mode: 0644]
intern/cycles/kernel/bvh/obvh_volume_all.h [new file with mode: 0644]
intern/cycles/kernel/bvh/qbvh_nodes.h
intern/cycles/kernel/geom/geom_triangle_intersect.h
intern/cycles/kernel/kernel_compat_cpu.h
intern/cycles/kernel/kernel_types.h
intern/cycles/util/CMakeLists.txt
intern/cycles/util/util_avxb.h [new file with mode: 0644]
intern/cycles/util/util_avxf.h
intern/cycles/util/util_debug.cpp
intern/cycles/util/util_types.h
intern/cycles/util/util_types_float8.h [new file with mode: 0644]
intern/cycles/util/util_types_float8_impl.h [new file with mode: 0644]

index c97d942..80b83c9 100644 (file)
@@ -50,6 +50,7 @@ enum_displacement_methods = (
 enum_bvh_layouts = (
     ('BVH2', "BVH2", "", 1),
     ('BVH4', "BVH4", "", 2),
+    ('BVH8', "BVH8", "", 4),
 )
 
 enum_bvh_types = (
@@ -686,7 +687,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
         cls.debug_bvh_layout = EnumProperty(
             name="BVH Layout",
             items=enum_bvh_layouts,
-            default='BVH4',
+            default='BVH8',
         )
         cls.debug_use_cpu_split_kernel = BoolProperty(name="Split Kernel", default=False)
 
index fedcb87..fcdadf0 100644 (file)
@@ -679,7 +679,15 @@ SceneParams BlenderSync::get_scene_params(BL::Scene& b_scene,
                params.texture_limit = 0;
        }
 
-       params.bvh_layout = DebugFlags().cpu.bvh_layout;
+       /* TODO(sergey): Once OSL supports per-microarchitecture optimization get
+        * rid of this.
+        */
+       if (params.shadingsystem == SHADINGSYSTEM_OSL) {
+               params.bvh_layout = BVH_LAYOUT_BVH4;
+       }
+       else {
+               params.bvh_layout = DebugFlags().cpu.bvh_layout;
+       }
 
        return params;
 }
index b8171e7..fcd2857 100644 (file)
@@ -10,6 +10,7 @@ set(SRC
        bvh.cpp
        bvh2.cpp
        bvh4.cpp
+       bvh8.cpp
        bvh_binning.cpp
        bvh_build.cpp
        bvh_node.cpp
@@ -22,6 +23,7 @@ set(SRC_HEADERS
        bvh.h
        bvh2.h
        bvh4.h
+       bvh8.h
        bvh_binning.h
        bvh_build.h
        bvh_node.h
index b524ca0..bc73a3a 100644 (file)
@@ -22,6 +22,7 @@
 
 #include "bvh/bvh2.h"
 #include "bvh/bvh4.h"
+#include "bvh/bvh8.h"
 #include "bvh/bvh_build.h"
 #include "bvh/bvh_node.h"
 
@@ -38,6 +39,7 @@ const char *bvh_layout_name(BVHLayout layout)
        switch(layout) {
                case BVH_LAYOUT_BVH2: return "BVH2";
                case BVH_LAYOUT_BVH4: return "BVH4";
+               case BVH_LAYOUT_BVH8: return "BVH8";
                case BVH_LAYOUT_NONE: return "NONE";
                case BVH_LAYOUT_ALL:  return "ALL";
        }
@@ -92,6 +94,8 @@ BVH *BVH::create(const BVHParams& params, const vector<Object*>& objects)
                        return new BVH2(params, objects);
                case BVH_LAYOUT_BVH4:
                        return new BVH4(params, objects);
+               case BVH_LAYOUT_BVH8:
+                       return new BVH8(params, objects);
                case BVH_LAYOUT_NONE:
                case BVH_LAYOUT_ALL:
                        break;
@@ -215,6 +219,38 @@ void BVH::refit_primitives(int start, int end, BoundBox& bbox, uint& visibility)
                        }
                }
                visibility |= ob->visibility_for_tracing();
+
+       }
+}
+
+bool BVH::leaf_check(const BVHNode *node, BVH_TYPE bvh)
+{
+       if(node->is_leaf()) {
+               return node->is_unaligned;
+       }
+       else {
+               return node_is_unaligned(node, bvh);
+       }
+}
+
+bool BVH::node_is_unaligned(const BVHNode *node, BVH_TYPE bvh)
+{
+       const BVHNode *node0 = node->get_child(0);
+       const BVHNode *node1 = node->get_child(1);
+
+       switch(bvh) {
+               case bvh2:
+                       return node0->is_unaligned || node1->is_unaligned;
+                       break;
+               case bvh4:
+                       return leaf_check(node0, bvh2) || leaf_check(node1, bvh2);
+                       break;
+               case bvh8:
+                       return leaf_check(node0, bvh4) || leaf_check(node1, bvh4);
+                       break;
+               default:
+                       assert(0);
+                       return false;
        }
 }
 
@@ -291,8 +327,8 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
         * BVH's are stored in global arrays. This function merges them into the
         * top level BVH, adjusting indexes and offsets where appropriate.
         */
-       /* TODO(sergey): This code needs adjustment for wider BVH than 4. */
        const bool use_qbvh = (params.bvh_layout == BVH_LAYOUT_BVH4);
+       const bool use_obvh = (params.bvh_layout == BVH_LAYOUT_BVH8);
 
        /* Adjust primitive index to point to the triangle in the global array, for
         * meshes with transform applied and already in the top level BVH.
@@ -469,14 +505,26 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
                        for(size_t i = 0, j = 0; i < bvh_nodes_size; j++) {
                                size_t nsize, nsize_bbox;
                                if(bvh_nodes[i].x & PATH_RAY_NODE_UNALIGNED) {
-                                       nsize = use_qbvh
-                                                   ? BVH_UNALIGNED_QNODE_SIZE
-                                                   : BVH_UNALIGNED_NODE_SIZE;
-                                       nsize_bbox = (use_qbvh)? 13: 0;
+                                       if(use_obvh) {
+                                               nsize = BVH_UNALIGNED_ONODE_SIZE;
+                                               nsize_bbox = BVH_UNALIGNED_ONODE_SIZE-1;
+                                       }
+                                       else {
+                                               nsize = use_qbvh
+                                                       ? BVH_UNALIGNED_QNODE_SIZE
+                                                       : BVH_UNALIGNED_NODE_SIZE;
+                                               nsize_bbox = (use_qbvh) ? BVH_UNALIGNED_QNODE_SIZE-1 : 0;
+                                       }
                                }
                                else {
-                                       nsize = (use_qbvh)? BVH_QNODE_SIZE: BVH_NODE_SIZE;
-                                       nsize_bbox = (use_qbvh)? 7: 0;
+                                       if(use_obvh) {
+                                               nsize = BVH_ONODE_SIZE;
+                                               nsize_bbox = BVH_ONODE_SIZE-1;
+                                       }
+                                       else {
+                                               nsize = (use_qbvh)? BVH_QNODE_SIZE: BVH_NODE_SIZE;
+                                               nsize_bbox = (use_qbvh)? BVH_QNODE_SIZE-1 : 0;
+                                       }
                                }
 
                                memcpy(pack_nodes + pack_nodes_offset,
@@ -485,16 +533,29 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
 
                                /* Modify offsets into arrays */
                                int4 data = bvh_nodes[i + nsize_bbox];
-
-                               data.z += (data.z < 0)? -noffset_leaf: noffset;
-                               data.w += (data.w < 0)? -noffset_leaf: noffset;
-
-                               if(use_qbvh) {
-                                       data.x += (data.x < 0)? -noffset_leaf: noffset;
-                                       data.y += (data.y < 0)? -noffset_leaf: noffset;
+                               int4 data1 = bvh_nodes[i + nsize_bbox-1];
+                               if(use_obvh) {
+                                       data.z += (data.z < 0) ? -noffset_leaf : noffset;
+                                       data.w += (data.w < 0) ? -noffset_leaf : noffset;
+                                       data.x += (data.x < 0) ? -noffset_leaf : noffset;
+                                       data.y += (data.y < 0) ? -noffset_leaf : noffset;
+                                       data1.z += (data1.z < 0) ? -noffset_leaf : noffset;
+                                       data1.w += (data1.w < 0) ? -noffset_leaf : noffset;
+                                       data1.x += (data1.x < 0) ? -noffset_leaf : noffset;
+                                       data1.y += (data1.y < 0) ? -noffset_leaf : noffset;
+                               }
+                               else {
+                                       data.z += (data.z < 0) ? -noffset_leaf : noffset;
+                                       data.w += (data.w < 0) ? -noffset_leaf : noffset;
+                                       if(use_qbvh) {
+                                               data.x += (data.x < 0)? -noffset_leaf: noffset;
+                                               data.y += (data.y < 0)? -noffset_leaf: noffset;
+                                       }
                                }
-
                                pack_nodes[pack_nodes_offset + nsize_bbox] = data;
+                               if(use_obvh) {
+                                       pack_nodes[pack_nodes_offset + nsize_bbox - 1] = data1;
+                               }
 
                                /* Usually this copies nothing, but we better
                                 * be prepared for possible node size extension.
index 6a82f91..86be0ba 100644 (file)
@@ -73,6 +73,12 @@ struct PackedBVH {
        }
 };
 
+enum BVH_TYPE {
+       bvh2,
+       bvh4,
+       bvh8
+};
+
 /* BVH */
 
 class BVH
@@ -93,6 +99,8 @@ protected:
 
        /* Refit range of primitives. */
        void refit_primitives(int start, int end, BoundBox& bbox, uint& visibility);
+       static __forceinline bool leaf_check(const BVHNode *node, BVH_TYPE bvh);
+       static bool node_is_unaligned(const BVHNode *node, BVH_TYPE bvh);
 
        /* triangles and strands */
        void pack_primitives();
index 9d89d2b..4a423c1 100644 (file)
 
 CCL_NAMESPACE_BEGIN
 
-static bool node_bvh_is_unaligned(const BVHNode *node)
-{
-       const BVHNode *node0 = node->get_child(0),
-                     *node1 = node->get_child(1);
-       return node0->is_unaligned || node1->is_unaligned;
-}
-
 BVH2::BVH2(const BVHParams& params_, const vector<Object*>& objects_)
 : BVH(params_, objects_)
 {
@@ -195,7 +188,7 @@ void BVH2::pack_nodes(const BVHNode *root)
        }
        else {
                stack.push_back(BVHStackEntry(root, nextNodeIdx));
-               nextNodeIdx += node_bvh_is_unaligned(root)
+               nextNodeIdx += node_is_unaligned(root, bvh2)
                                       ? BVH_UNALIGNED_NODE_SIZE
                                       : BVH_NODE_SIZE;
        }
@@ -218,7 +211,7 @@ void BVH2::pack_nodes(const BVHNode *root)
                                }
                                else {
                                        idx[i] = nextNodeIdx;
-                                       nextNodeIdx += node_bvh_is_unaligned(e.node->get_child(i))
+                                       nextNodeIdx += node_is_unaligned(e.node->get_child(i), bvh2)
                                                               ? BVH_UNALIGNED_NODE_SIZE
                                                               : BVH_NODE_SIZE;
                                }
index 4faf47a..a449587 100644 (file)
@@ -30,27 +30,6 @@ CCL_NAMESPACE_BEGIN
  * Perhaps we can merge nodes in actual tree and make our
  * life easier all over the place.
  */
-static bool node_qbvh_is_unaligned(const BVHNode *node)
-{
-       const BVHNode *node0 = node->get_child(0),
-                     *node1 = node->get_child(1);
-       bool has_unaligned = false;
-       if(node0->is_leaf()) {
-               has_unaligned |= node0->is_unaligned;
-       }
-       else {
-               has_unaligned |= node0->get_child(0)->is_unaligned;
-               has_unaligned |= node0->get_child(1)->is_unaligned;
-       }
-       if(node1->is_leaf()) {
-               has_unaligned |= node1->is_unaligned;
-       }
-       else {
-               has_unaligned |= node1->get_child(0)->is_unaligned;
-               has_unaligned |= node1->get_child(1)->is_unaligned;
-       }
-       return has_unaligned;
-}
 
 BVH4::BVH4(const BVHParams& params_, const vector<Object*>& objects_)
 : BVH(params_, objects_)
@@ -304,7 +283,7 @@ void BVH4::pack_nodes(const BVHNode *root)
        }
        else {
                stack.push_back(BVHStackEntry(root, nextNodeIdx));
-               nextNodeIdx += node_qbvh_is_unaligned(root)
+               nextNodeIdx += node_is_unaligned(root, bvh4)
                                       ? BVH_UNALIGNED_QNODE_SIZE
                                       : BVH_QNODE_SIZE;
        }
@@ -348,7 +327,7 @@ void BVH4::pack_nodes(const BVHNode *root)
                                }
                                else {
                                        idx = nextNodeIdx;
-                                       nextNodeIdx += node_qbvh_is_unaligned(nodes[i])
+                                       nextNodeIdx += node_is_unaligned(nodes[i], bvh4)
                                                               ? BVH_UNALIGNED_QNODE_SIZE
                                                               : BVH_QNODE_SIZE;
                                }
@@ -438,7 +417,7 @@ void BVH4::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
                                            visibility,
                                            0.0f,
                                            1.0f,
-                                           4);
+                                           num_nodes);
                }
                else {
                        pack_aligned_node(idx,
@@ -447,7 +426,7 @@ void BVH4::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
                                          visibility,
                                          0.0f,
                                          1.0f,
-                                         4);
+                                         num_nodes);
                }
        }
 }
diff --git a/intern/cycles/bvh/bvh8.cpp b/intern/cycles/bvh/bvh8.cpp
new file mode 100644 (file)
index 0000000..e8b6726
--- /dev/null
@@ -0,0 +1,515 @@
+/*
+Copyright (c) 2017, Intel Corporation
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of Intel Corporation nor the names of its contributors
+may be used to endorse or promote products derived from this software
+without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "bvh/bvh8.h"
+
+#include "render/mesh.h"
+#include "render/object.h"
+
+#include "bvh/bvh_node.h"
+#include "bvh/bvh_unaligned.h"
+
+CCL_NAMESPACE_BEGIN
+
+BVH8::BVH8(const BVHParams& params_, const vector<Object*>& objects_)
+: BVH(params_, objects_)
+{
+}
+
+void BVH8::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf)
+{
+       float4 data[BVH_ONODE_LEAF_SIZE];
+       memset(data, 0, sizeof(data));
+       if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) {
+               /* object */
+               data[0].x = __int_as_float(~(leaf->lo));
+               data[0].y = __int_as_float(0);
+       }
+       else {
+               /* triangle */
+               data[0].x = __int_as_float(leaf->lo);
+               data[0].y = __int_as_float(leaf->hi);
+       }
+       data[0].z = __uint_as_float(leaf->visibility);
+       if(leaf->num_triangles() != 0) {
+               data[0].w = __uint_as_float(pack.prim_type[leaf->lo]);
+       }
+
+       memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_ONODE_LEAF_SIZE);
+}
+
+void BVH8::pack_inner(const BVHStackEntry& e,
+                      const BVHStackEntry *en,
+                      int num)
+{
+       bool has_unaligned = false;
+       /* Check whether we have to create unaligned node or all nodes are aligned
+        * and we can cut some corner here.
+        */
+       if(params.use_unaligned_nodes) {
+               for(int i = 0; i < num; i++) {
+                       if(en[i].node->is_unaligned) {
+                               has_unaligned = true;
+                               break;
+                       }
+               }
+       }
+       if(has_unaligned) {
+               /* There's no unaligned children, pack into AABB node. */
+               pack_unaligned_inner(e, en, num);
+       }
+       else {
+               /* Create unaligned node with orientation transform for each of the
+                * children.
+                */
+               pack_aligned_inner(e, en, num);
+       }
+}
+
+void BVH8::pack_aligned_inner(const BVHStackEntry& e,
+                              const BVHStackEntry *en,
+                              int num)
+{
+       BoundBox bounds[8];
+       int child[8];
+       for(int i = 0; i < num; ++i) {
+               bounds[i] = en[i].node->bounds;
+               child[i] = en[i].encodeIdx();
+       }
+       pack_aligned_node(e.idx,
+                         bounds,
+                         child,
+                         e.node->visibility,
+                         e.node->time_from,
+                         e.node->time_to,
+                         num);
+}
+
+void BVH8::pack_aligned_node(int idx,
+                             const BoundBox *bounds,
+                             const int *child,
+                             const uint visibility,
+                             const float time_from,
+                             const float time_to,
+                             const int num)
+{
+       float8 data[8];
+       memset(data, 0, sizeof(data));
+
+       data[0].a = __uint_as_float(visibility & ~PATH_RAY_NODE_UNALIGNED);
+       data[0].b = time_from;
+       data[0].c = time_to;
+       for(int i = 0; i < num; i++) {
+               float3 bb_min = bounds[i].min;
+               float3 bb_max = bounds[i].max;
+
+               data[1][i] = bb_min.x;
+               data[2][i] = bb_max.x;
+               data[3][i] = bb_min.y;
+               data[4][i] = bb_max.y;
+               data[5][i] = bb_min.z;
+               data[6][i] = bb_max.z;
+
+               data[7][i] = __int_as_float(child[i]);
+       }
+
+       for(int i = num; i < 8; i++) {
+               /* We store BB which would never be recorded as intersection
+               * so kernel might safely assume there are always 4 child nodes.
+               */
+               data[1][i] = FLT_MAX;
+               data[2][i] = -FLT_MAX;
+
+               data[3][i] = FLT_MAX;
+               data[4][i] = -FLT_MAX;
+
+               data[5][i] = FLT_MAX;
+               data[6][i] = -FLT_MAX;
+
+               data[7][i] = __int_as_float(0);
+       }
+       memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_ONODE_SIZE);
+}
+
+void BVH8::pack_unaligned_inner(const BVHStackEntry& e,
+                                const BVHStackEntry *en,
+                                int num)
+{
+       Transform aligned_space[8];
+       BoundBox bounds[8];
+       int child[8];
+       for(int i = 0; i < num; ++i) {
+               aligned_space[i] = en[i].node->get_aligned_space();
+               bounds[i] = en[i].node->bounds;
+               child[i] = en[i].encodeIdx();
+       }
+       pack_unaligned_node(e.idx,
+                           aligned_space,
+                           bounds,
+                           child,
+                           e.node->visibility,
+                           e.node->time_from,
+                           e.node->time_to,
+                           num);
+}
+
+void BVH8::pack_unaligned_node(int idx,
+                               const Transform *aligned_space,
+                               const BoundBox *bounds,
+                               const int *child,
+                               const uint visibility,
+                               const float time_from,
+                               const float time_to,
+                               const int num)
+{
+       float8 data[BVH_UNALIGNED_ONODE_SIZE];
+       memset(data, 0, sizeof(data));
+       data[0].a = __uint_as_float(visibility | PATH_RAY_NODE_UNALIGNED);
+       data[0].b = time_from;
+       data[0].c = time_to;
+
+       for(int i = 0; i < num; i++) {
+               Transform space = BVHUnaligned::compute_node_transform(
+                       bounds[i],
+                       aligned_space[i]);
+
+               data[1][i] = space.x.x;
+               data[2][i] = space.x.y;
+               data[3][i] = space.x.z;
+
+               data[4][i] = space.y.x;
+               data[5][i] = space.y.y;
+               data[6][i] = space.y.z;
+
+               data[7][i] = space.z.x;
+               data[8][i] = space.z.y;
+               data[9][i] = space.z.z;
+
+               data[10][i] = space.x.w;
+               data[11][i] = space.y.w;
+               data[12][i] = space.z.w;
+
+               data[13][i] = __int_as_float(child[i]);
+       }
+
+       for(int i = num; i < 8; i++) {
+               /* We store BB which would never be recorded as intersection
+                * so kernel might safely assume there are always 4 child nodes.
+                */
+
+               data[1][i] = 1.0f;
+               data[2][i] = 0.0f;
+               data[3][i] = 0.0f;
+
+               data[4][i] = 0.0f;
+               data[5][i] = 0.0f;
+               data[6][i] = 0.0f;
+
+               data[7][i] = 0.0f;
+               data[8][i] = 0.0f;
+               data[9][i] = 0.0f;
+
+               data[10][i] = -FLT_MAX;
+               data[11][i] = -FLT_MAX;
+               data[12][i] = -FLT_MAX;
+
+               data[13][i] = __int_as_float(0);
+       }
+
+       memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_ONODE_SIZE);
+}
+
+/* Quad SIMD Nodes */
+
+void BVH8::pack_nodes(const BVHNode *root)
+{
+       /* Calculate size of the arrays required. */
+       const size_t num_nodes = root->getSubtreeSize(BVH_STAT_ONODE_COUNT);
+       const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT);
+       assert(num_leaf_nodes <= num_nodes);
+       const size_t num_inner_nodes = num_nodes - num_leaf_nodes;
+       size_t node_size;
+       if(params.use_unaligned_nodes) {
+               const size_t num_unaligned_nodes =
+                       root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_ONODE_COUNT);
+               node_size = (num_unaligned_nodes * BVH_UNALIGNED_ONODE_SIZE) +
+                       (num_inner_nodes - num_unaligned_nodes) * BVH_ONODE_SIZE;
+       }
+       else {
+               node_size = num_inner_nodes * BVH_ONODE_SIZE;
+       }
+       /* Resize arrays. */
+       pack.nodes.clear();
+       pack.leaf_nodes.clear();
+       /* For top level BVH, first merge existing BVH's so we know the offsets. */
+       if(params.top_level) {
+               pack_instances(node_size, num_leaf_nodes*BVH_ONODE_LEAF_SIZE);
+       }
+       else {
+               pack.nodes.resize(node_size);
+               pack.leaf_nodes.resize(num_leaf_nodes*BVH_ONODE_LEAF_SIZE);
+       }
+
+       int nextNodeIdx = 0, nextLeafNodeIdx = 0;
+
+       vector<BVHStackEntry> stack;
+       stack.reserve(BVHParams::MAX_DEPTH*2);
+       if(root->is_leaf()) {
+               stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++));
+       }
+       else {
+               stack.push_back(BVHStackEntry(root, nextNodeIdx));
+               nextNodeIdx += node_is_unaligned(root, bvh8)
+                                  ? BVH_UNALIGNED_ONODE_SIZE
+                                  : BVH_ONODE_SIZE;
+       }
+
+       while(stack.size()) {
+               BVHStackEntry e = stack.back();
+               stack.pop_back();
+
+               if(e.node->is_leaf()) {
+                       /* leaf node */
+                       const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node);
+                       pack_leaf(e, leaf);
+               }
+               else {
+                       /* Inner node. */
+                       const BVHNode *node = e.node;
+                       const BVHNode *node0 = node->get_child(0);
+                       const BVHNode *node1 = node->get_child(1);
+                       /* Collect nodes. */
+                       const BVHNode *nodes[8];
+                       int numnodes = 0;
+                       if(node0->is_leaf()) {
+                               nodes[numnodes++] = node0;
+                       }
+                       else {
+                               const BVHNode *node00 = node0->get_child(0),
+                                             *node01 = node0->get_child(1);
+                               if(node00->is_leaf()) {
+                                       nodes[numnodes++] = node00;
+                               }
+                               else {
+                                       nodes[numnodes++] = node00->get_child(0);
+                                       nodes[numnodes++] = node00->get_child(1);
+                               }
+                               if(node01->is_leaf()) {
+                                       nodes[numnodes++] = node01;
+                               }
+                               else {
+                                       nodes[numnodes++] = node01->get_child(0);
+                                       nodes[numnodes++] = node01->get_child(1);
+                               }
+                       }
+                       if(node1->is_leaf()) {
+                               nodes[numnodes++] = node1;
+                       }
+                       else {
+                               const BVHNode *node10 = node1->get_child(0),
+                                             *node11 = node1->get_child(1);
+                               if(node10->is_leaf()) {
+                                       nodes[numnodes++] = node10;
+                               }
+                               else {
+                                       nodes[numnodes++] = node10->get_child(0);
+                                       nodes[numnodes++] = node10->get_child(1);
+                               }
+                               if(node11->is_leaf()) {
+                                       nodes[numnodes++] = node11;
+                               }
+                               else {
+                                       nodes[numnodes++] = node11->get_child(0);
+                                       nodes[numnodes++] = node11->get_child(1);
+                               }
+                       }
+                       /* Push entries on the stack. */
+                       for(int i = 0; i < numnodes; ++i) {
+                               int idx;
+                               if(nodes[i]->is_leaf()) {
+                                       idx = nextLeafNodeIdx++;
+                               }
+                               else {
+                                       idx = nextNodeIdx;
+                                       nextNodeIdx += node_is_unaligned(nodes[i], bvh8)
+                                               ? BVH_UNALIGNED_ONODE_SIZE
+                                               : BVH_ONODE_SIZE; 
+                               }
+                               stack.push_back(BVHStackEntry(nodes[i], idx));
+                       }
+                       /* Set node. */
+                       pack_inner(e, &stack[stack.size() - numnodes], numnodes);
+               }
+       }
+       assert(node_size == nextNodeIdx);
+       /* Root index to start traversal at, to handle case of single leaf node. */
+       pack.root_index = (root->is_leaf()) ? -1 : 0;
+}
+
+void BVH8::refit_nodes()
+{
+       assert(!params.top_level);
+
+       BoundBox bbox = BoundBox::empty;
+       uint visibility = 0;
+       refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility);
+}
+
+void BVH8::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
+{
+       if(leaf) {
+               int4 *data = &pack.leaf_nodes[idx];
+               int4 c = data[0];
+               /* Refit leaf node. */
+               for(int prim = c.x; prim < c.y; prim++) {
+                       int pidx = pack.prim_index[prim];
+                       int tob = pack.prim_object[prim];
+                       Object *ob = objects[tob];
+
+                       if(pidx == -1) {
+                               /* Object instance. */
+                               bbox.grow(ob->bounds);
+                       }
+                       else {
+                               /* Primitives. */
+                               const Mesh *mesh = ob->mesh;
+
+                               if(pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) {
+                                       /* Curves. */
+                                       int str_offset = (params.top_level) ? mesh->curve_offset : 0;
+                                       Mesh::Curve curve = mesh->get_curve(pidx - str_offset);
+                                       int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]);
+
+                                       curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox);
+
+                                       visibility |= PATH_RAY_CURVE;
+
+                                       /* Motion curves. */
+                                       if(mesh->use_motion_blur) {
+                                               Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+
+                                               if(attr) {
+                                                       size_t mesh_size = mesh->curve_keys.size();
+                                                       size_t steps = mesh->motion_steps - 1;
+                                                       float3 *key_steps = attr->data_float3();
+
+                                                       for(size_t i = 0; i < steps; i++) {
+                                                               curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bbox);
+                                                       }
+                                               }
+                                       }
+                               }
+                               else {
+                                       /* Triangles. */
+                                       int tri_offset = (params.top_level) ? mesh->tri_offset : 0;
+                                       Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset);
+                                       const float3 *vpos = &mesh->verts[0];
+
+                                       triangle.bounds_grow(vpos, bbox);
+
+                                       /* Motion triangles. */
+                                       if(mesh->use_motion_blur) {
+                                               Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+
+                                               if(attr) {
+                                                       size_t mesh_size = mesh->verts.size();
+                                                       size_t steps = mesh->motion_steps - 1;
+                                                       float3 *vert_steps = attr->data_float3();
+
+                                                       for(size_t i = 0; i < steps; i++) {
+                                                               triangle.bounds_grow(vert_steps + i*mesh_size, bbox);
+                                                       }
+                                               }
+                                       }
+                               }
+                       }
+
+                       visibility |= ob->visibility;
+               }
+
+               float4 leaf_data[BVH_ONODE_LEAF_SIZE];
+               leaf_data[0].x = __int_as_float(c.x);
+               leaf_data[0].y = __int_as_float(c.y);
+               leaf_data[0].z = __uint_as_float(visibility);
+               leaf_data[0].w = __uint_as_float(c.w);
+               memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4)*BVH_ONODE_LEAF_SIZE);
+       }
+       else {
+               int4 *data = &pack.nodes[idx];
+               bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0;
+               int4 c;
+               if(is_unaligned) {
+                       c = data[BVH_UNALIGNED_ONODE_SIZE-1];
+               }
+               else {
+                       c = data[BVH_ONODE_SIZE-1];
+               }
+               /* Refit inner node, set bbox from children. */
+               BoundBox child_bbox[8] = { BoundBox::empty, BoundBox::empty,
+                                          BoundBox::empty, BoundBox::empty,
+                                          BoundBox::empty, BoundBox::empty,
+                                          BoundBox::empty, BoundBox::empty };
+               uint child_visibility[8] = { 0 };
+               int num_nodes = 0;
+
+               for(int i = 0; i < 8; ++i) {
+                       if(c[i] != 0) {
+                               refit_node((c[i] < 0)? -c[i]-1: c[i], (c[i] < 0),
+                                          child_bbox[i], child_visibility[i]);
+                               ++num_nodes;
+                               bbox.grow(child_bbox[i]);
+                               visibility |= child_visibility[i];
+                       }
+               }
+
+               if(is_unaligned) {
+                       Transform aligned_space[8] = { transform_identity(), transform_identity(),
+                                                      transform_identity(), transform_identity(),
+                                                      transform_identity(), transform_identity(),
+                                                      transform_identity(), transform_identity()};
+                       pack_unaligned_node(idx,
+                                           aligned_space,
+                                           child_bbox,
+                                           &c[0],
+                                           visibility,
+                                           0.0f,
+                                           1.0f,
+                                           num_nodes);
+               }
+               else {
+                       pack_aligned_node(idx,
+                                         child_bbox,
+                                         &c[0],
+                                         visibility,
+                                         0.0f,
+                                         1.0f,
+                                         num_nodes);
+               }
+       }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/bvh/bvh8.h b/intern/cycles/bvh/bvh8.h
new file mode 100644 (file)
index 0000000..99ff79e
--- /dev/null
@@ -0,0 +1,97 @@
+/*
+Copyright (c) 2017, Intel Corporation
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of Intel Corporation nor the names of its contributors
+may be used to endorse or promote products derived from this software
+without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef __BVH8_H__
+#define __BVH8_H__
+
+#include "bvh/bvh.h"
+#include "bvh/bvh_params.h"
+
+#include "util/util_types.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BVHNode;
+struct BVHStackEntry;
+class BVHParams;
+class BoundBox;
+class LeafNode;
+class Object;
+class Progress;
+
+#define BVH_ONODE_SIZE           16
+#define BVH_ONODE_LEAF_SIZE      1
+#define BVH_UNALIGNED_ONODE_SIZE 28
+
+/* BVH8
+*
+* Octo BVH, with each node having eight children, to use with SIMD instructions.
+*/
+class BVH8 : public BVH {
+protected:
+       /* constructor */
+       friend class BVH;
+       BVH8(const BVHParams& params, const vector<Object*>& objects);
+
+       /* pack */
+       void pack_nodes(const BVHNode *root);
+
+       void pack_leaf(const BVHStackEntry& e, const LeafNode *leaf);
+       void pack_inner(const BVHStackEntry& e, const BVHStackEntry *en, int num);
+
+       void pack_aligned_inner(const BVHStackEntry& e,
+                               const BVHStackEntry *en,
+                               int num);
+       void pack_aligned_node(int idx,
+                              const BoundBox *bounds,
+                              const int *child,
+                              const uint visibility,
+                              const float time_from,
+                              const float time_to,
+                              const int num);
+
+       void pack_unaligned_inner(const BVHStackEntry& e,
+                                 const BVHStackEntry *en,
+                                 int num);
+       void pack_unaligned_node(int idx,
+                                const Transform *aligned_space,
+                                const BoundBox *bounds,
+                                const int *child,
+                                const uint visibility,
+                                const float time_from,
+                                const float time_to,
+                                const int num);
+
+       /* refit */
+       void refit_nodes();
+       void refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility);
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __BVH8_H__ */
index 879d07b..fd8b497 100644 (file)
@@ -61,6 +61,55 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
                                }
                        }
                        return cnt;
+               case BVH_STAT_ONODE_COUNT:
+                       cnt = 1;
+                       for(int i = 0; i < num_children(); i++) {
+                               BVHNode *node = get_child(i);
+                               if(node->is_leaf()) {
+                                       cnt += 1;
+                               }
+                               else {
+                                       for(int j = 0; j < node->num_children(); j++)
+                                       {
+                                               BVHNode *node_next = node->get_child(j);
+                                               if(node_next->is_leaf()) {
+                                                       cnt += 1;
+                                               }
+                                               else {
+                                                       for(int k = 0; k < node_next->num_children(); k++) {
+                                                               cnt += node_next->get_child(k)->getSubtreeSize(stat);
+                                                       }
+                                               }
+                                       }
+                               }
+                       }
+                       return cnt;
+               case BVH_STAT_UNALIGNED_INNER_ONODE_COUNT:
+                       {
+                               bool has_unaligned = false;
+                               for(int i = 0; i < num_children(); i++) {
+                                       BVHNode *node = get_child(i);
+                                       if(node->is_leaf()) {
+                                               has_unaligned |= node->is_unaligned;
+                                       }
+                                       else {
+                                               for(int j = 0; j < node->num_children(); j++) {
+                                                       BVHNode *node_next = node->get_child(j);
+                                                       if(node_next->is_leaf()) {
+                                                               has_unaligned |= node_next->is_unaligned;
+                                                       }
+                                                       else {
+                                                               for(int k = 0; k < node_next->num_children(); k++) {
+                                                                       cnt += node_next->get_child(k)->getSubtreeSize(stat);
+                                                                       has_unaligned |= node_next->get_child(k)->is_unaligned;
+                                                               }
+                                                       }
+                                               }
+                                       }
+                               }
+                               cnt += has_unaligned? 1: 0;
+                       }
+                       return cnt;
                case BVH_STAT_ALIGNED_COUNT:
                        if(!is_unaligned) {
                                cnt = 1;
index 94cf5ab..ed89d52 100644 (file)
@@ -39,6 +39,8 @@ enum BVH_STAT {
        BVH_STAT_ALIGNED_LEAF_COUNT,
        BVH_STAT_UNALIGNED_LEAF_COUNT,
        BVH_STAT_DEPTH,
+       BVH_STAT_ONODE_COUNT,
+       BVH_STAT_UNALIGNED_INNER_ONODE_COUNT,
 };
 
 class BVHParams;
index e92bbbf..5b3761f 100644 (file)
@@ -1031,6 +1031,9 @@ void device_cpu_info(vector<DeviceInfo>& devices)
        if(system_cpu_support_sse2()) {
                info.bvh_layout_mask |= BVH_LAYOUT_BVH4;
        }
+       if (system_cpu_support_avx2()) {
+               info.bvh_layout_mask |= BVH_LAYOUT_BVH8;
+       }
        info.has_volume_decoupled = true;
        info.has_osl = true;
        info.has_half_images = true;
index 118e183..c6e92c6 100644 (file)
@@ -76,6 +76,12 @@ set(SRC_BVH_HEADERS
        bvh/qbvh_traversal.h
        bvh/qbvh_volume.h
        bvh/qbvh_volume_all.h
+       bvh/obvh_nodes.h
+       bvh/obvh_shadow_all.h
+       bvh/obvh_local.h
+       bvh/obvh_traversal.h
+       bvh/obvh_volume.h
+       bvh/obvh_volume_all.h
 )
 
 set(SRC_HEADERS
@@ -270,6 +276,8 @@ set(SRC_UTIL_HEADERS
        ../util/util_types_float3_impl.h
        ../util/util_types_float4.h
        ../util/util_types_float4_impl.h
+       ../util/util_types_float8.h
+       ../util/util_types_float8_impl.h
        ../util/util_types_int2.h
        ../util/util_types_int2_impl.h
        ../util/util_types_int3.h
index de1503e..2ad55d0 100644 (file)
@@ -32,6 +32,9 @@ CCL_NAMESPACE_BEGIN
 /* Common QBVH functions. */
 #ifdef __QBVH__
 #  include "kernel/bvh/qbvh_nodes.h"
+#ifdef __KERNEL_AVX2__
+#  include "kernel/bvh/obvh_nodes.h"
+#endif
 #endif
 
 /* Regular BVH traversal */
index 6356c19..8def71b 100644 (file)
@@ -19,6 +19,9 @@
 
 #ifdef __QBVH__
 #  include "kernel/bvh/qbvh_local.h"
+#  ifdef __KERNEL_AVX2__
+#    include "kernel/bvh/obvh_local.h"
+#  endif
 #endif
 
 #if BVH_FEATURE(BVH_HAIR)
@@ -254,6 +257,15 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
                                          int max_hits)
 {
        switch(kernel_data.bvh.bvh_layout) {
+#ifdef __KERNEL_AVX2__
+               case BVH_LAYOUT_BVH8:
+                       return BVH_FUNCTION_FULL_NAME(OBVH)(kg,
+                                                           ray,
+                                                           local_isect,
+                                                           local_object,
+                                                           lcg_state,
+                                                           max_hits);
+#endif
 #ifdef __QBVH__
                case BVH_LAYOUT_BVH4:
                        return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
index cfc567f..d525b29 100644 (file)
@@ -19,6 +19,9 @@
 
 #ifdef __QBVH__
 #  include "kernel/bvh/qbvh_shadow_all.h"
+#ifdef __KERNEL_AVX2__
+#  include "kernel/bvh/obvh_shadow_all.h"
+#endif
 #endif
 
 #if BVH_FEATURE(BVH_HAIR)
@@ -396,6 +399,15 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
                                          uint *num_hits)
 {
        switch(kernel_data.bvh.bvh_layout) {
+#ifdef __KERNEL_AVX2__
+               case BVH_LAYOUT_BVH8:
+                       return BVH_FUNCTION_FULL_NAME(OBVH)(kg,
+                                                           ray,
+                                                           isect_array,
+                                                           visibility,
+                                                           max_hits,
+                                                           num_hits);
+#endif
 #ifdef __QBVH__
                case BVH_LAYOUT_BVH4:
                        return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
index 551625e..e95d240 100644 (file)
@@ -20,6 +20,9 @@
 #ifdef __QBVH__
 #  include "kernel/bvh/qbvh_traversal.h"
 #endif
+#ifdef __KERNEL_AVX2__
+#  include "kernel/bvh/obvh_traversal.h"
+#endif
 
 #if BVH_FEATURE(BVH_HAIR)
 #  define NODE_INTERSECT bvh_node_intersect
@@ -427,6 +430,19 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
                                          )
 {
        switch(kernel_data.bvh.bvh_layout) {
+#ifdef __KERNEL_AVX2__
+               case BVH_LAYOUT_BVH8:
+                       return BVH_FUNCTION_FULL_NAME(OBVH)(kg,
+                                                           ray,
+                                                           isect,
+                                                           visibility
+#  if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+                                                           , lcg_state,
+                                                           difl,
+                                                           extmax
+#  endif
+                                                           );
+#endif
 #ifdef __QBVH__
                case BVH_LAYOUT_BVH4:
                        return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
index ead424a..4ca0dc2 100644 (file)
@@ -32,7 +32,7 @@ CCL_NAMESPACE_BEGIN
 /* 64 object BVH + 64 mesh BVH + 64 object node splitting */
 #define BVH_STACK_SIZE 192
 #define BVH_QSTACK_SIZE 384
-
+#define BVH_OSTACK_SIZE 768
 /* BVH intersection function variations */
 
 #define BVH_INSTANCING                 1
index ce5fc7b..7d03855 100644 (file)
@@ -19,6 +19,9 @@
 
 #ifdef __QBVH__
 #  include "kernel/bvh/qbvh_volume.h"
+#ifdef __KERNEL_AVX2__
+#  include "kernel/bvh/obvh_volume.h"
+#endif
 #endif
 
 #if BVH_FEATURE(BVH_HAIR)
@@ -310,6 +313,13 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
                                          const uint visibility)
 {
        switch(kernel_data.bvh.bvh_layout) {
+#ifdef __KERNEL_AVX2__
+               case BVH_LAYOUT_BVH8:
+                       return BVH_FUNCTION_FULL_NAME(OBVH)(kg,
+                                                           ray,
+                                                           isect,
+                                                           visibility);
+#endif
 #ifdef __QBVH__
                case BVH_LAYOUT_BVH4:
                        return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
index 2ee29ac..3d9b598 100644 (file)
@@ -19,6 +19,9 @@
 
 #ifdef __QBVH__
 #  include "kernel/bvh/qbvh_volume_all.h"
+#ifdef __KERNEL_AVX2__
+#  include "kernel/bvh/obvh_volume_all.h"
+#endif
 #endif
 
 #if BVH_FEATURE(BVH_HAIR)
@@ -386,6 +389,14 @@ ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg,
                                          const uint visibility)
 {
        switch(kernel_data.bvh.bvh_layout) {
+#ifdef __KERNEL_AVX2__
+               case BVH_LAYOUT_BVH8:
+                       return BVH_FUNCTION_FULL_NAME(OBVH)(kg,
+                                                           ray,
+                                                           isect_array,
+                                                           max_hits,
+                                                           visibility);
+#endif
 #ifdef __QBVH__
                case BVH_LAYOUT_BVH4:
                        return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
diff --git a/intern/cycles/kernel/bvh/obvh_local.h b/intern/cycles/kernel/bvh/obvh_local.h
new file mode 100644 (file)
index 0000000..50bcfa7
--- /dev/null
@@ -0,0 +1,409 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function for subsurface scattering, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+#if BVH_FEATURE(BVH_HAIR)
+#  define NODE_INTERSECT obvh_node_intersect
+#else
+#  define NODE_INTERSECT obvh_aligned_node_intersect
+#endif
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
+                                             const Ray *ray,
+                                             LocalIntersection *local_isect,
+                                             int local_object,
+                                             uint *lcg_state,
+                                             int max_hits)
+{
+       /* Traversal stack in CUDA thread-local memory. */
+       OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
+       traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
+
+       /* Traversal variables in registers. */
+       int stack_ptr = 0;
+       int node_addr = kernel_tex_fetch(__object_node, local_object);
+
+       /* Ray parameters in registers. */
+       float3 P = ray->P;
+       float3 dir = bvh_clamp_direction(ray->D);
+       float3 idir = bvh_inverse_direction(dir);
+       int object = OBJECT_NONE;
+       float isect_t = ray->t;
+
+       local_isect->num_hits = 0;
+
+       const int object_flag = kernel_tex_fetch(__object_flag, local_object);
+       if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+#if BVH_FEATURE(BVH_MOTION)
+               Transform ob_itfm;
+               isect_t = bvh_instance_motion_push(kg,
+                                                  local_object,
+                                                  ray,
+                                                  &P,
+                                                  &dir,
+                                                  &idir,
+                                                  isect_t,
+                                                  &ob_itfm);
+#else
+               isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t);
+#endif
+               object = local_object;
+       }
+
+#ifndef __KERNEL_SSE41__
+       if(!isfinite(P.x)) {
+               return false;
+       }
+#endif
+
+       avxf tnear(0.0f), tfar(isect_t);
+#if BVH_FEATURE(BVH_HAIR)
+       avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+#endif
+       avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+
+#ifdef __KERNEL_AVX2__
+       float3 P_idir = P*idir;
+       avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+       avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z));
+#endif
+
+       /* Offsets to select the side that becomes the lower or upper bound. */
+       int near_x, near_y, near_z;
+       int far_x, far_y, far_z;
+       obvh_near_far_idx_calc(idir,
+                              &near_x, &near_y, &near_z,
+                              &far_x, &far_y, &far_z);
+
+       /* Traversal loop. */
+       do {
+               do {
+                       /* Traverse internal nodes. */
+                       while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
+                               avxf dist;
+                               int child_mask = NODE_INTERSECT(kg,
+                                                               tnear,
+                                                               tfar,
+#ifdef __KERNEL_AVX2__
+                                                               P_idir4,
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+                                                               org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+                                                               dir4,
+#endif
+                                                               idir4,
+                                                               near_x, near_y, near_z,
+                                                               far_x, far_y, far_z,
+                                                               node_addr,
+                                                               &dist);
+
+                               if(child_mask != 0) {
+                                       float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+                                       avxf cnodes;
+#if BVH_FEATURE(BVH_HAIR)
+                                       if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+                                               cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26);
+                                       }
+                                       else
+#endif
+                                       {
+                                               cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14);
+                                       }
+
+                                       /* One child is hit, continue with that child. */
+                                       int r = __bscf(child_mask);
+                                       if(child_mask == 0) {
+                                               node_addr = __float_as_int(cnodes[r]);
+                                               continue;
+                                       }
+
+                                       /* Two children are hit, push far child, and continue with
+                                        * closer child.
+                                        */
+                                       int c0 = __float_as_int(cnodes[r]);
+                                       float d0 = ((float*)&dist)[r];
+                                       r = __bscf(child_mask);
+                                       int c1 = __float_as_int(cnodes[r]);
+                                       float d1 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               if(d1 < d0) {
+                                                       node_addr = c1;
+                                                       ++stack_ptr;
+                                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                                       traversal_stack[stack_ptr].addr = c0;
+                                                       traversal_stack[stack_ptr].dist = d0;
+                                                       continue;
+                                               }
+                                               else {
+                                                       node_addr = c0;
+                                                       ++stack_ptr;
+                                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                                       traversal_stack[stack_ptr].addr = c1;
+                                                       traversal_stack[stack_ptr].dist = d1;
+                                                       continue;
+                                               }
+                                       }
+
+                                       /* Here starts the slow path for 3 or 4 hit children. We push
+                                        * all nodes onto the stack to sort them there.
+                                        */
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c1;
+                                       traversal_stack[stack_ptr].dist = d1;
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c0;
+                                       traversal_stack[stack_ptr].dist = d0;
+
+                                       /* Three children are hit, push all onto stack and sort 3
+                                        * stack items, continue with closest child.
+                                        */
+                                       r = __bscf(child_mask);
+                                       int c2 = __float_as_int(cnodes[r]);
+                                       float d2 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c2;
+                                               traversal_stack[stack_ptr].dist = d2;
+                                               obvh_stack_sort(&traversal_stack[stack_ptr],
+                                                               &traversal_stack[stack_ptr - 1],
+                                                               &traversal_stack[stack_ptr - 2]);
+                                               node_addr = traversal_stack[stack_ptr].addr;
+                                               --stack_ptr;
+                                               continue;
+                                       }
+
+                                       /* Four children are hit, push all onto stack and sort 4
+                                        * stack items, continue with closest child.
+                                        */
+                                       r = __bscf(child_mask);
+                                       int c3 = __float_as_int(cnodes[r]);
+                                       float d3 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c3;
+                                               traversal_stack[stack_ptr].dist = d3;
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c2;
+                                               traversal_stack[stack_ptr].dist = d2;
+                                               obvh_stack_sort(&traversal_stack[stack_ptr],
+                                                               &traversal_stack[stack_ptr - 1],
+                                                               &traversal_stack[stack_ptr - 2],
+                                                               &traversal_stack[stack_ptr - 3]);
+                                               node_addr = traversal_stack[stack_ptr].addr;
+                                               --stack_ptr;
+                                               continue;
+                                       }
+
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c3;
+                                       traversal_stack[stack_ptr].dist = d3;
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c2;
+                                       traversal_stack[stack_ptr].dist = d2;
+
+                                       /* Five children are hit, push all onto stack and sort 5
+                                        * stack items, continue with closest child
+                                        */
+                                       r = __bscf(child_mask);
+                                       int c4 = __float_as_int(cnodes[r]);
+                                       float d4 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c4;
+                                               traversal_stack[stack_ptr].dist = d4;
+                                               obvh_stack_sort(&traversal_stack[stack_ptr],
+                                                               &traversal_stack[stack_ptr - 1],
+                                                               &traversal_stack[stack_ptr - 2],
+                                                               &traversal_stack[stack_ptr - 3],
+                                                               &traversal_stack[stack_ptr - 4]);
+                                               node_addr = traversal_stack[stack_ptr].addr;
+                                               --stack_ptr;
+                                               continue;
+                                       }
+                                       /* Six children are hit, push all onto stack and sort 6
+                                        * stack items, continue with closest child.
+                                        */
+                                       r = __bscf(child_mask);
+                                       int c5 = __float_as_int(cnodes[r]);
+                                       float d5 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c5;
+                                               traversal_stack[stack_ptr].dist = d5;
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c4;
+                                               traversal_stack[stack_ptr].dist = d4;
+                                               obvh_stack_sort(&traversal_stack[stack_ptr],
+                                                               &traversal_stack[stack_ptr - 1],
+                                                               &traversal_stack[stack_ptr - 2],
+                                                               &traversal_stack[stack_ptr - 3],
+                                                               &traversal_stack[stack_ptr - 4],
+                                                               &traversal_stack[stack_ptr - 5]);
+                                               node_addr = traversal_stack[stack_ptr].addr;
+                                               --stack_ptr;
+                                               continue;
+                                       }
+
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c5;
+                                       traversal_stack[stack_ptr].dist = d5;
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c4;
+                                       traversal_stack[stack_ptr].dist = d4;
+
+                                       /* Seven children are hit, push all onto stack and sort 7
+                                        * stack items, continue with closest child.
+                                        */
+                                       r = __bscf(child_mask);
+                                       int c6 = __float_as_int(cnodes[r]);
+                                       float d6 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c6;
+                                               traversal_stack[stack_ptr].dist = d6;
+                                               obvh_stack_sort(&traversal_stack[stack_ptr],
+                                                               &traversal_stack[stack_ptr - 1],
+                                                               &traversal_stack[stack_ptr - 2],
+                                                               &traversal_stack[stack_ptr - 3],
+                                                               &traversal_stack[stack_ptr - 4],
+                                                               &traversal_stack[stack_ptr - 5],
+                                                               &traversal_stack[stack_ptr - 6]);
+                                               node_addr = traversal_stack[stack_ptr].addr;
+                                               --stack_ptr;
+                                               continue;
+                                       }
+                                       /* Eight children are hit, push all onto stack and sort 8
+                                        * stack items, continue with closest child.
+                                        */
+                                       r = __bscf(child_mask);
+                                       int c7 = __float_as_int(cnodes[r]);
+                                       float d7 = ((float*)&dist)[r];
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c7;
+                                       traversal_stack[stack_ptr].dist = d7;
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c6;
+                                       traversal_stack[stack_ptr].dist = d6;
+                                       obvh_stack_sort(&traversal_stack[stack_ptr],
+                                                       &traversal_stack[stack_ptr - 1],
+                                                       &traversal_stack[stack_ptr - 2],
+                                                       &traversal_stack[stack_ptr - 3],
+                                                       &traversal_stack[stack_ptr - 4],
+                                                       &traversal_stack[stack_ptr - 5],
+                                                       &traversal_stack[stack_ptr - 6],
+                                                       &traversal_stack[stack_ptr - 7]);
+                                       node_addr = traversal_stack[stack_ptr].addr;
+                                       --stack_ptr;
+                                       continue;
+                               }
+
+                               node_addr = traversal_stack[stack_ptr].addr;
+                               --stack_ptr;
+                       }
+
+                       /* If node is leaf, fetch triangle list. */
+                       if(node_addr < 0) {
+                               float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
+                               int prim_addr = __float_as_int(leaf.x);
+
+                               int prim_addr2 = __float_as_int(leaf.y);
+                               const uint type = __float_as_int(leaf.w);
+
+                               /* Pop. */
+                               node_addr = traversal_stack[stack_ptr].addr;
+                               --stack_ptr;
+
+                               /* Primitive intersection. */
+                               switch(type & PRIMITIVE_ALL) {
+                                       case PRIMITIVE_TRIANGLE: {
+                                               /* Intersect ray against primitive, */
+                                               for(; prim_addr < prim_addr2; prim_addr++) {
+                                                       kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+                                                       if(triangle_intersect_local(kg,
+                                                                                   local_isect,
+                                                                                   P,
+                                                                                   dir,
+                                                                                   object,
+                                                                                   local_object,
+                                                                                   prim_addr,
+                                                                                   isect_t,
+                                                                                   lcg_state,
+                                                                                   max_hits))
+                                                       {
+                                                               return true;
+                                                       }
+                                               }
+                                               break;
+                                       }
+#if BVH_FEATURE(BVH_MOTION)
+                                       case PRIMITIVE_MOTION_TRIANGLE: {
+                                               /* Intersect ray against primitive. */
+                                               for(; prim_addr < prim_addr2; prim_addr++) {
+                                                       kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+                                                       if(motion_triangle_intersect_local(kg,
+                                                                                          local_isect,
+                                                                                          P,
+                                                                                          dir,
+                                                                                          ray->time,
+                                                                                          object,
+                                                                                          local_object,
+                                                                                          prim_addr,
+                                                                                          isect_t,
+                                                                                          lcg_state,
+                                                                                          max_hits))
+                                                       {
+                                                               return true;
+                                                       }
+                                               }
+                                               break;
+                                       }
+#endif
+                                       default:
+                                               break;
+                               }
+                       }
+               } while(node_addr != ENTRYPOINT_SENTINEL);
+       } while(node_addr != ENTRYPOINT_SENTINEL);
+       return false;
+}
+
+#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/bvh/obvh_nodes.h b/intern/cycles/kernel/bvh/obvh_nodes.h
new file mode 100644 (file)
index 0000000..93f35f6
--- /dev/null
@@ -0,0 +1,532 @@
+/*
+ * Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Aligned nodes intersection AVX code is adopted from Embree,
+ */
+
+struct OBVHStackItem {
+       int addr;
+       float dist;
+};
+
+ccl_device_inline void obvh_near_far_idx_calc(const float3& idir,
+                                              int *ccl_restrict near_x,
+                                              int *ccl_restrict near_y,
+                                              int *ccl_restrict near_z,
+                                              int *ccl_restrict far_x,
+                                              int *ccl_restrict far_y,
+                                              int *ccl_restrict far_z)
+
+{
+#ifdef __KERNEL_SSE__
+       *near_x = 0; *far_x = 1;
+       *near_y = 2; *far_y = 3;
+       *near_z = 4; *far_z = 5;
+
+       const size_t mask = movemask(ssef(idir.m128));
+
+       const int mask_x = mask & 1;
+       const int mask_y = (mask & 2) >> 1;
+       const int mask_z = (mask & 4) >> 2;
+
+       *near_x += mask_x; *far_x -= mask_x;
+       *near_y += mask_y; *far_y -= mask_y;
+       *near_z += mask_z; *far_z -= mask_z;
+#else
+       if(idir.x >= 0.0f) { *near_x = 0; *far_x = 1; } else { *near_x = 1; *far_x = 0; }
+       if(idir.y >= 0.0f) { *near_y = 2; *far_y = 3; } else { *near_y = 3; *far_y = 2; }
+       if(idir.z >= 0.0f) { *near_z = 4; *far_z = 5; } else { *near_z = 5; *far_z = 4; }
+#endif
+}
+
+ccl_device_inline void obvh_item_swap(OBVHStackItem *ccl_restrict a,
+                                      OBVHStackItem *ccl_restrict b)
+{
+       OBVHStackItem tmp = *a;
+       *a = *b;
+       *b = tmp;
+}
+
+ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
+                                       OBVHStackItem *ccl_restrict s2,
+                                       OBVHStackItem *ccl_restrict s3)
+{
+       if(s2->dist < s1->dist) { obvh_item_swap(s2, s1); }
+       if(s3->dist < s2->dist) { obvh_item_swap(s3, s2); }
+       if(s2->dist < s1->dist) { obvh_item_swap(s2, s1); }
+}
+
+ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
+                                       OBVHStackItem *ccl_restrict s2,
+                                       OBVHStackItem *ccl_restrict s3,
+                                       OBVHStackItem *ccl_restrict s4)
+{
+       if(s2->dist < s1->dist) { obvh_item_swap(s2, s1); }
+       if(s4->dist < s3->dist) { obvh_item_swap(s4, s3); }
+       if(s3->dist < s1->dist) { obvh_item_swap(s3, s1); }
+       if(s4->dist < s2->dist) { obvh_item_swap(s4, s2); }
+       if(s3->dist < s2->dist) { obvh_item_swap(s3, s2); }
+}
+
+ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
+                                       OBVHStackItem *ccl_restrict s2,
+                                       OBVHStackItem *ccl_restrict s3,
+                                       OBVHStackItem *ccl_restrict s4,
+                                       OBVHStackItem *ccl_restrict s5)
+{
+       obvh_stack_sort(s1, s2, s3, s4);
+       if(s5->dist < s4->dist) {
+               obvh_item_swap(s4, s5);
+               if(s4->dist < s3->dist) {
+                       obvh_item_swap(s3, s4);
+                       if(s3->dist < s2->dist) {
+                               obvh_item_swap(s2, s3);
+                               if(s2->dist < s1->dist) {
+                                       obvh_item_swap(s1, s2);
+                               }
+                       }
+               }
+       }
+}
+
+ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
+                                       OBVHStackItem *ccl_restrict s2,
+                                       OBVHStackItem *ccl_restrict s3,
+                                       OBVHStackItem *ccl_restrict s4,
+                                       OBVHStackItem *ccl_restrict s5,
+                                       OBVHStackItem *ccl_restrict s6)
+{
+       obvh_stack_sort(s1, s2, s3, s4, s5);
+       if(s6->dist < s5->dist) {
+               obvh_item_swap(s5, s6);
+               if(s5->dist < s4->dist) {
+                       obvh_item_swap(s4, s5);
+                       if(s4->dist < s3->dist) {
+                               obvh_item_swap(s3, s4);
+                               if(s3->dist < s2->dist) {
+                                       obvh_item_swap(s2, s3);
+                                       if(s2->dist < s1->dist) {
+                                               obvh_item_swap(s1, s2);
+                                       }
+                               }
+                       }
+               }
+       }
+}
+
+ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
+                                       OBVHStackItem *ccl_restrict s2,
+                                       OBVHStackItem *ccl_restrict s3,
+                                       OBVHStackItem *ccl_restrict s4,
+                                       OBVHStackItem *ccl_restrict s5,
+                                       OBVHStackItem *ccl_restrict s6,
+                                       OBVHStackItem *ccl_restrict s7)
+{
+       obvh_stack_sort(s1, s2, s3, s4, s5, s6);
+       if(s7->dist < s6->dist) {
+               obvh_item_swap(s6, s7);
+               if(s6->dist < s5->dist) {
+                       obvh_item_swap(s5, s6);
+                       if(s5->dist < s4->dist) {
+                               obvh_item_swap(s4, s5);
+                               if(s4->dist < s3->dist) {
+                                       obvh_item_swap(s3, s4);
+                                       if(s3->dist < s2->dist) {
+                                               obvh_item_swap(s2, s3);
+                                               if(s2->dist < s1->dist) {
+                                                       obvh_item_swap(s1, s2);
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+}
+
+ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
+                                       OBVHStackItem *ccl_restrict s2,
+                                       OBVHStackItem *ccl_restrict s3,
+                                       OBVHStackItem *ccl_restrict s4,
+                                       OBVHStackItem *ccl_restrict s5,
+                                       OBVHStackItem *ccl_restrict s6,
+                                       OBVHStackItem *ccl_restrict s7,
+                                       OBVHStackItem *ccl_restrict s8)
+{
+       obvh_stack_sort(s1, s2, s3, s4, s5, s6, s7);
+       if(s8->dist < s7->dist) {
+               obvh_item_swap(s7, s8);
+               if(s7->dist < s6->dist) {
+                       obvh_item_swap(s6, s7);
+                       if(s6->dist < s5->dist) {
+                               obvh_item_swap(s5, s6);
+                               if(s5->dist < s4->dist) {
+                                       obvh_item_swap(s4, s5);
+                                       if(s4->dist < s3->dist) {
+                                               obvh_item_swap(s3, s4);
+                                               if(s3->dist < s2->dist) {
+                                                       obvh_item_swap(s2, s3);
+                                                       if(s2->dist < s1->dist) {
+                                                               obvh_item_swap(s1, s2);
+                                                       }
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+}
+
+/* Axis-aligned nodes intersection */
+
+ccl_device_inline int obvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg,
+                                                  const avxf& isect_near,
+                                                  const avxf& isect_far,
+#ifdef __KERNEL_AVX2__
+                                                  const avx3f& org_idir,
+#else
+                                                  const avx3f& org,
+#endif
+                                                  const avx3f& idir,
+                                                  const int near_x,
+                                                  const int near_y,
+                                                  const int near_z,
+                                                  const int far_x,
+                                                  const int far_y,
+                                                  const int far_z,
+                                                  const int node_addr,
+                                                  avxf *ccl_restrict dist)
+{
+       const int offset = node_addr + 2;
+#ifdef __KERNEL_AVX2__
+       const avxf tnear_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+near_x*2), idir.x, org_idir.x);
+       const avxf tnear_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+near_y*2), idir.y, org_idir.y);
+       const avxf tnear_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+near_z*2), idir.z, org_idir.z);
+       const avxf tfar_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+far_x*2), idir.x, org_idir.x);
+       const avxf tfar_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+far_y*2), idir.y, org_idir.y);
+       const avxf tfar_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+far_z*2), idir.z, org_idir.z);
+
+       const avxf tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
+       const avxf tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
+       const avxb vmask = tnear <= tfar;
+       int mask = (int)movemask(vmask);
+       *dist = tnear;
+       return mask;
+#else
+       return 0;
+#endif
+}
+
+ccl_device_inline int obvh_aligned_node_intersect_robust(
+        KernelGlobals *ccl_restrict kg,
+        const avxf& isect_near,
+        const avxf& isect_far,
+#ifdef __KERNEL_AVX2__
+        const avx3f& P_idir,
+#else
+        const avx3f& P,
+#endif
+        const avx3f& idir,
+        const int near_x,
+        const int near_y,
+        const int near_z,
+        const int far_x,
+        const int far_y,
+        const int far_z,
+        const int node_addr,
+        const float difl,
+        avxf *ccl_restrict dist)
+{
+       const int offset = node_addr + 2;
+#ifdef __KERNEL_AVX2__
+       const avxf tnear_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + near_x * 2), idir.x, P_idir.x);
+       const avxf tfar_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + far_x * 2), idir.x, P_idir.x);
+       const avxf tnear_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + near_y * 2), idir.y, P_idir.y);
+       const avxf tfar_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + far_y * 2), idir.y, P_idir.y);
+       const avxf tnear_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + near_z * 2), idir.z, P_idir.z);
+       const avxf tfar_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + far_z * 2), idir.z, P_idir.z);
+
+       const float round_down = 1.0f - difl;
+       const float round_up = 1.0f + difl;
+       const avxf tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
+       const avxf tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
+       const avxb vmask = round_down*tnear <= round_up*tfar;
+       int mask = (int)movemask(vmask);
+       *dist = tnear;
+       return mask;
+#else
+       return 0;
+#endif
+}
+
+/* Unaligned nodes intersection */
+
+ccl_device_inline int obvh_unaligned_node_intersect(
+        KernelGlobals *ccl_restrict kg,
+        const avxf& isect_near,
+        const avxf& isect_far,
+#ifdef __KERNEL_AVX2__
+        const avx3f& org_idir,
+#endif
+        const avx3f& org,
+        const avx3f& dir,
+        const avx3f& idir,
+        const int near_x,
+        const int near_y,
+        const int near_z,
+        const int far_x,
+        const int far_y,
+        const int far_z,
+        const int node_addr,
+        avxf *ccl_restrict dist)
+{
+       const int offset = node_addr;
+       const avxf tfm_x_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+2);
+       const avxf tfm_x_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+4);
+       const avxf tfm_x_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+6);
+
+       const avxf tfm_y_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+8);
+       const avxf tfm_y_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+10);
+       const avxf tfm_y_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+12);
+
+       const avxf tfm_z_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+14);
+       const avxf tfm_z_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+16);
+       const avxf tfm_z_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+18);
+
+       const avxf tfm_t_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+20);
+       const avxf tfm_t_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+22);
+       const avxf tfm_t_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+24);
+
+       const avxf aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z,
+                  aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z,
+                  aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z;
+
+       const avxf aligned_P_x = org.x*tfm_x_x + org.y*tfm_x_y + org.z*tfm_x_z + tfm_t_x,
+                  aligned_P_y = org.x*tfm_y_x + org.y*tfm_y_y + org.z*tfm_y_z + tfm_t_y,
+                  aligned_P_z = org.x*tfm_z_x + org.y*tfm_z_y + org.z*tfm_z_z + tfm_t_z;
+
+       const avxf neg_one(-1.0f);
+       const avxf nrdir_x = neg_one / aligned_dir_x,
+                  nrdir_y = neg_one / aligned_dir_y,
+                  nrdir_z = neg_one / aligned_dir_z;
+
+       const avxf tlower_x = aligned_P_x * nrdir_x,
+                  tlower_y = aligned_P_y * nrdir_y,
+                  tlower_z = aligned_P_z * nrdir_z;
+
+       const avxf tupper_x = tlower_x - nrdir_x,
+                  tupper_y = tlower_y - nrdir_y,
+                  tupper_z = tlower_z - nrdir_z;
+
+       const avxf tnear_x = min(tlower_x, tupper_x);
+       const avxf tnear_y = min(tlower_y, tupper_y);
+       const avxf tnear_z = min(tlower_z, tupper_z);
+       const avxf tfar_x = max(tlower_x, tupper_x);
+       const avxf tfar_y = max(tlower_y, tupper_y);
+       const avxf tfar_z = max(tlower_z, tupper_z);
+       const avxf tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
+       const avxf tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
+       const avxb vmask = tnear <= tfar;
+       *dist = tnear;
+       return movemask(vmask);
+}
+
+ccl_device_inline int obvh_unaligned_node_intersect_robust(
+        KernelGlobals *ccl_restrict kg,
+        const avxf& isect_near,
+        const avxf& isect_far,
+#ifdef __KERNEL_AVX2__
+        const avx3f& P_idir,
+#endif
+        const avx3f& P,
+        const avx3f& dir,
+        const avx3f& idir,
+        const int near_x,
+        const int near_y,
+        const int near_z,
+        const int far_x,
+        const int far_y,
+        const int far_z,
+        const int node_addr,
+        const float difl,
+        avxf *ccl_restrict dist)
+{
+       const int offset = node_addr;
+       const avxf tfm_x_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+2);
+       const avxf tfm_x_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+4);
+       const avxf tfm_x_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+6);
+
+       const avxf tfm_y_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+8);
+       const avxf tfm_y_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+10);
+       const avxf tfm_y_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+12);
+
+       const avxf tfm_z_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+14);
+       const avxf tfm_z_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+16);
+       const avxf tfm_z_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+18);
+
+       const avxf tfm_t_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+20);
+       const avxf tfm_t_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+22);
+       const avxf tfm_t_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+24);
+
+       const avxf aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z,
+                  aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z,
+                  aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z;
+
+       const avxf aligned_P_x = P.x*tfm_x_x + P.y*tfm_x_y + P.z*tfm_x_z + tfm_t_x,
+                  aligned_P_y = P.x*tfm_y_x + P.y*tfm_y_y + P.z*tfm_y_z + tfm_t_y,
+                  aligned_P_z = P.x*tfm_z_x + P.y*tfm_z_y + P.z*tfm_z_z + tfm_t_z;
+
+       const avxf neg_one(-1.0f);
+       const avxf nrdir_x = neg_one / aligned_dir_x,
+                  nrdir_y = neg_one / aligned_dir_y,
+                  nrdir_z = neg_one / aligned_dir_z;
+
+       const avxf tlower_x = aligned_P_x * nrdir_x,
+                  tlower_y = aligned_P_y * nrdir_y,
+                  tlower_z = aligned_P_z * nrdir_z;
+
+       const avxf tupper_x = tlower_x - nrdir_x,
+                  tupper_y = tlower_y - nrdir_y,
+                  tupper_z = tlower_z - nrdir_z;
+
+       const float round_down = 1.0f - difl;
+       const float round_up = 1.0f + difl;
+
+       const avxf tnear_x = min(tlower_x, tupper_x);
+       const avxf tnear_y = min(tlower_y, tupper_y);
+       const avxf tnear_z = min(tlower_z, tupper_z);
+       const avxf tfar_x = max(tlower_x, tupper_x);
+       const avxf tfar_y = max(tlower_y, tupper_y);
+       const avxf tfar_z = max(tlower_z, tupper_z);
+
+       const avxf tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
+       const avxf tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
+       const avxb vmask = round_down*tnear <= round_up*tfar;
+       *dist = tnear;
+       return movemask(vmask);
+}
+
+/* Intersectors wrappers.
+ *
+ * They'll check node type and call appropriate intersection code.
+ */
+
+ccl_device_inline int obvh_node_intersect(
+        KernelGlobals *ccl_restrict kg,
+        const avxf& isect_near,
+        const avxf& isect_far,
+#ifdef __KERNEL_AVX2__
+        const avx3f& org_idir,
+#endif
+        const avx3f& org,
+        const avx3f& dir,
+        const avx3f& idir,
+        const int near_x,
+        const int near_y,
+        const int near_z,
+        const int far_x,
+        const int far_y,
+        const int far_z,
+        const int node_addr,
+        avxf *ccl_restrict dist)
+{
+       const int offset = node_addr;
+       const float4 node = kernel_tex_fetch(__bvh_nodes, offset);
+       if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
+               return obvh_unaligned_node_intersect(kg,
+                                                    isect_near,
+                                                    isect_far,
+#ifdef __KERNEL_AVX2__
+                                                    org_idir,
+#endif
+                                                    org,
+                                                    dir,
+                                                    idir,
+                                                    near_x, near_y, near_z,
+                                                    far_x, far_y, far_z,
+                                                    node_addr,
+                                                    dist);
+       }
+       else {
+               return obvh_aligned_node_intersect(kg,
+                                                  isect_near,
+                                                  isect_far,
+#ifdef __KERNEL_AVX2__
+                                                  org_idir,
+#else
+                                                  org,
+#endif
+                                                  idir,
+                                                  near_x, near_y, near_z,
+                                                  far_x, far_y, far_z,
+                                                  node_addr,
+                                                  dist);
+       }
+}
+
+ccl_device_inline int obvh_node_intersect_robust(
+        KernelGlobals *ccl_restrict kg,
+        const avxf& isect_near,
+        const avxf& isect_far,
+#ifdef __KERNEL_AVX2__
+        const avx3f& P_idir,
+#endif
+        const avx3f& P,
+        const avx3f& dir,
+        const avx3f& idir,
+        const int near_x,
+        const int near_y,
+        const int near_z,
+        const int far_x,
+        const int far_y,
+        const int far_z,
+        const int node_addr,
+        const float difl,
+        avxf *ccl_restrict dist)
+{
+       const int offset = node_addr;
+       const float4 node = kernel_tex_fetch(__bvh_nodes, offset);
+       if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
+               return obvh_unaligned_node_intersect_robust(kg,
+                                                           isect_near,
+                                                           isect_far,
+#ifdef __KERNEL_AVX2__
+                                                           P_idir,
+#endif
+                                                           P,
+                                                           dir,
+                                                           idir,
+                                                           near_x, near_y, near_z,
+                                                           far_x, far_y, far_z,
+                                                           node_addr,
+                                                           difl,
+                                                           dist);
+       }
+       else {
+               return obvh_aligned_node_intersect_robust(kg,
+                                                         isect_near,
+                                                         isect_far,
+#ifdef __KERNEL_AVX2__
+                                                         P_idir,
+#else
+                                                         P,
+#endif
+                                                         idir,
+                                                         near_x, near_y, near_z,
+                                                         far_x, far_y, far_z,
+                                                         node_addr,
+                                                         difl,
+                                                         dist);
+       }
+}
diff --git a/intern/cycles/kernel/bvh/obvh_shadow_all.h b/intern/cycles/kernel/bvh/obvh_shadow_all.h
new file mode 100644 (file)
index 0000000..3e87706
--- /dev/null
@@ -0,0 +1,687 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function, where various features can be
+ * enabled/disabled. This way we can compile optimized versions for each case
+ * without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+#if BVH_FEATURE(BVH_HAIR)
+#  define NODE_INTERSECT obvh_node_intersect
+#else
+#  define NODE_INTERSECT obvh_aligned_node_intersect
+#endif
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
+                                             const Ray *ray,
+                                             Intersection *isect_array,
+                                             const int skip_object,
+                                             const uint max_hits,
+                                             uint *num_hits)
+{
+       /* TODO(sergey):
+        *  - Test if pushing distance on the stack helps.
+        * - Likely and unlikely for if() statements.
+        * - Test restrict attribute for pointers.
+        */
+
+       /* Traversal stack in CUDA thread-local memory. */
+       OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
+       traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
+
+       /* Traversal variables in registers. */
+       int stack_ptr = 0;
+       int node_addr = kernel_data.bvh.root;
+
+       /* Ray parameters in registers. */
+       const float tmax = ray->t;
+       float3 P = ray->P;
+       float3 dir = bvh_clamp_direction(ray->D);
+       float3 idir = bvh_inverse_direction(dir);
+       int object = OBJECT_NONE;
+       float isect_t = tmax;
+
+#if BVH_FEATURE(BVH_MOTION)
+       Transform ob_itfm;
+#endif
+
+       *num_hits = 0;
+       isect_array->t = tmax;
+
+#ifndef __KERNEL_SSE41__
+       if(!isfinite(P.x)) {
+               return false;
+       }
+#endif
+
+#if BVH_FEATURE(BVH_INSTANCING)
+       int num_hits_in_instance = 0;
+#endif
+
+       avxf tnear(0.0f), tfar(isect_t);
+#if BVH_FEATURE(BVH_HAIR)
+       avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+#endif
+       avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+
+#ifdef __KERNEL_AVX2__
+       float3 P_idir = P*idir;
+       avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+       avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z));
+#endif
+
+       /* Offsets to select the side that becomes the lower or upper bound. */
+       int near_x, near_y, near_z;
+       int far_x, far_y, far_z;
+       obvh_near_far_idx_calc(idir,
+                              &near_x, &near_y, &near_z,
+                              &far_x, &far_y, &far_z);
+
+       /* Traversal loop. */
+       do {
+               do {
+                       /* Traverse internal nodes. */
+                       while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
+                               float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+                               (void)inodes;
+
+                               if(false
+#ifdef __VISIBILITY_FLAG__
+                                  || ((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0)
+#endif
+#if BVH_FEATURE(BVH_MOTION)
+                                  || UNLIKELY(ray->time < inodes.y)
+                                  || UNLIKELY(ray->time > inodes.z)
+#endif
+                               ) {
+                                       /* Pop. */
+                                       node_addr = traversal_stack[stack_ptr].addr;
+                                       --stack_ptr;
+                                       continue;
+                               }
+
+                               avxf dist;
+                               int child_mask = NODE_INTERSECT(kg,
+                                                               tnear,
+                                                               tfar,
+#ifdef __KERNEL_AVX2__
+                                                                   P_idir4,
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+//#if !defined(__KERNEL_AVX2__)
+                                                                   org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+                                                                   dir4,
+#endif
+                                                                   idir4,
+                                                                   near_x, near_y, near_z,
+                                                                   far_x, far_y, far_z,
+                                                                   node_addr,
+                                                                   &dist);
+
+                               if(child_mask != 0) {
+                                       avxf cnodes;
+#if BVH_FEATURE(BVH_HAIR)
+                                       if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+                                               cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26);
+                                       }
+                                       else
+#endif
+                                       {
+                                               cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14);
+                                       }
+
+                                       /* One child is hit, continue with that child. */
+                                       int r = __bscf(child_mask);
+                                       if(child_mask == 0) {
+                                               node_addr = __float_as_int(cnodes[r]);
+                                               continue;
+                                       }
+
+                                       /* Two children are hit, push far child, and continue with
+                                        * closer child.
+                                        */
+                                       int c0 = __float_as_int(cnodes[r]);
+                                       float d0 = ((float*)&dist)[r];
+                                       r = __bscf(child_mask);
+                                       int c1 = __float_as_int(cnodes[r]);
+                                       float d1 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               if(d1 < d0) {
+                                                       node_addr = c1;
+                                                       ++stack_ptr;
+                                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                                       traversal_stack[stack_ptr].addr = c0;
+                                                       traversal_stack[stack_ptr].dist = d0;
+                                                       continue;
+                                               }
+                                               else {
+                                                       node_addr = c0;
+                                                       ++stack_ptr;
+                                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                                       traversal_stack[stack_ptr].addr = c1;
+                                                       traversal_stack[stack_ptr].dist = d1;
+                                                       continue;
+                                               }
+                                       }
+
+                                       /* Here starts the slow path for 3 or 4 hit children. We push
+                                        * all nodes onto the stack to sort them there.
+                                        */
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c1;
+                                       traversal_stack[stack_ptr].dist = d1;
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c0;
+                                       traversal_stack[stack_ptr].dist = d0;
+
+                                       /* Three children are hit, push all onto stack and sort 3
+                                        * stack items, continue with closest child.
+                                        */
+                                       r = __bscf(child_mask);
+                                       int c2 = __float_as_int(cnodes[r]);
+                                       float d2 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c2;
+                                               traversal_stack[stack_ptr].dist = d2;
+                                               obvh_stack_sort(&traversal_stack[stack_ptr],
+                                                               &traversal_stack[stack_ptr - 1],
+                                                               &traversal_stack[stack_ptr - 2]);
+                                               node_addr = traversal_stack[stack_ptr].addr;
+                                               --stack_ptr;
+                                               continue;
+                                       }
+
+                                       /* Four children are hit, push all onto stack and sort 4
+                                        * stack items, continue with closest child.
+                                        */
+                                       r = __bscf(child_mask);
+                                       int c3 = __float_as_int(cnodes[r]);
+                                       float d3 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c3;
+                                               traversal_stack[stack_ptr].dist = d3;
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c2;
+                                               traversal_stack[stack_ptr].dist = d2;
+                                               obvh_stack_sort(&traversal_stack[stack_ptr],
+                                                               &traversal_stack[stack_ptr - 1],
+                                                               &traversal_stack[stack_ptr - 2],
+                                                               &traversal_stack[stack_ptr - 3]);
+                                               node_addr = traversal_stack[stack_ptr].addr;
+                                               --stack_ptr;
+                                               continue;
+                                       }
+
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c3;
+                                       traversal_stack[stack_ptr].dist = d3;
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c2;
+                                       traversal_stack[stack_ptr].dist = d2;
+
+                                       /* Five children are hit, push all onto stack and sort 5
+                                        * stack items, continue with closest child
+                                        */
+                                       r = __bscf(child_mask);
+                                       int c4 = __float_as_int(cnodes[r]);
+                                       float d4 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c4;
+                                               traversal_stack[stack_ptr].dist = d4;
+                                               obvh_stack_sort(&traversal_stack[stack_ptr],
+                                                               &traversal_stack[stack_ptr - 1],
+                                                               &traversal_stack[stack_ptr - 2],
+                                                               &traversal_stack[stack_ptr - 3],
+                                                               &traversal_stack[stack_ptr - 4]);
+                                               node_addr = traversal_stack[stack_ptr].addr;
+                                               --stack_ptr;
+                                               continue;
+                                       }
+
+                                       /* Six children are hit, push all onto stack and sort 6
+                                        * stack items, continue with closest child.
+                                        */
+                                       r = __bscf(child_mask);
+                                       int c5 = __float_as_int(cnodes[r]);
+                                       float d5 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c5;
+                                               traversal_stack[stack_ptr].dist = d5;
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c4;
+                                               traversal_stack[stack_ptr].dist = d4;
+                                               obvh_stack_sort(&traversal_stack[stack_ptr],
+                                                               &traversal_stack[stack_ptr - 1],
+                                                               &traversal_stack[stack_ptr - 2],
+                                                               &traversal_stack[stack_ptr - 3],
+                                                               &traversal_stack[stack_ptr - 4],
+                                                               &traversal_stack[stack_ptr - 5]);
+                                               node_addr = traversal_stack[stack_ptr].addr;
+                                               --stack_ptr;
+                                               continue;
+                                       }
+
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c5;
+                                       traversal_stack[stack_ptr].dist = d5;
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c4;
+                                       traversal_stack[stack_ptr].dist = d4;
+
+                                       /* Seven children are hit, push all onto stack and sort 7
+                                        * stack items, continue with closest child.
+                                        */
+                                       r = __bscf(child_mask);
+                                       int c6 = __float_as_int(cnodes[r]);
+                                       float d6 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c6;
+                                               traversal_stack[stack_ptr].dist = d6;
+                                               obvh_stack_sort(&traversal_stack[stack_ptr],
+                                                               &traversal_stack[stack_ptr - 1],
+                                                               &traversal_stack[stack_ptr - 2],
+                                                               &traversal_stack[stack_ptr - 3],
+                                                               &traversal_stack[stack_ptr - 4],
+                                                               &traversal_stack[stack_ptr - 5],
+                                                               &traversal_stack[stack_ptr - 6]);
+                                               node_addr = traversal_stack[stack_ptr].addr;
+                                               --stack_ptr;
+                                               continue;
+                                       }
+
+                                       /* Eight children are hit, push all onto stack and sort 8
+                                        * stack items, continue with closest child.
+                                        */
+                                       r = __bscf(child_mask);
+                                       int c7 = __float_as_int(cnodes[r]);
+                                       float d7 = ((float*)&dist)[r];
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c7;
+                                       traversal_stack[stack_ptr].dist = d7;
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c6;
+                                       traversal_stack[stack_ptr].dist = d6;
+                                       obvh_stack_sort(&traversal_stack[stack_ptr],
+                                                       &traversal_stack[stack_ptr - 1],
+                                                       &traversal_stack[stack_ptr - 2],
+                                                       &traversal_stack[stack_ptr - 3],
+                                                       &traversal_stack[stack_ptr - 4],
+                                                       &traversal_stack[stack_ptr - 5],
+                                                       &traversal_stack[stack_ptr - 6],
+                                                       &traversal_stack[stack_ptr - 7]);
+                                       node_addr = traversal_stack[stack_ptr].addr;
+                                       --stack_ptr;
+                                       continue;
+                               }
+
+                               node_addr = traversal_stack[stack_ptr].addr;
+                               --stack_ptr;
+                       }
+
+                       /* If node is leaf, fetch triangle list. */
+                       if(node_addr < 0) {
+                               float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
+#ifdef __VISIBILITY_FLAG__
+                               if((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) {
+                                       /* Pop. */
+                                       node_addr = traversal_stack[stack_ptr].addr;
+                                       --stack_ptr;
+                                       continue;
+                               }
+#endif
+
+                               int prim_addr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+                               if(prim_addr >= 0) {
+#endif
+                                       int prim_addr2 = __float_as_int(leaf.y);
+                                       const uint type = __float_as_int(leaf.w);
+                                       const uint p_type = type & PRIMITIVE_ALL;
+
+                                       /* Pop. */
+                                       node_addr = traversal_stack[stack_ptr].addr;
+                                       --stack_ptr;
+
+                                       /* Primitive intersection. */
+                                       if(p_type == PRIMITIVE_TRIANGLE) {
+                                               int prim_count = prim_addr2 - prim_addr;
+                                               if(prim_count < 3) {
+                                                       while(prim_addr < prim_addr2) {
+                                                               kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);
+                                                               int hit = triangle_intersect(kg,
+                                                                                            isect_array,
+                                                                                            P,
+                                                                                            dir,
+                                                                                            PATH_RAY_SHADOW,
+                                                                                            object,
+                                                                                            prim_addr);
+                                                               /* Shadow ray early termination. */
+                                                               if(hit) {
+                                                                       /* detect if this surface has a shader with transparent shadows */
+
+                                                                       /* todo: optimize so primitive visibility flag indicates if
+                                                                        * the primitive has a transparent shadow shader? */
+                                                                       int prim = kernel_tex_fetch(__prim_index, isect_array->prim);
+                                                                       int shader = 0;
+
+#ifdef __HAIR__
+                                                                       if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
+#endif
+                                                                       {
+                                                                               shader = kernel_tex_fetch(__tri_shader, prim);
+                                                                       }
+#ifdef __HAIR__
+                                                                       else {
+                                                                               float4 str = kernel_tex_fetch(__curves, prim);
+                                                                               shader = __float_as_int(str.z);
+                                                                       }
+#endif
+                                                                       int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
+
+                                                                       /* if no transparent shadows, all light is blocked */
+                                                                       if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
+                                                                               return true;
+                                                                       }
+                                                                       /* if maximum number of hits reached, block all light */
+                                                                       else if(*num_hits == max_hits) {
+                                                                               return true;
+                                                                       }
+
+                                                                       /* move on to next entry in intersections array */
+                                                                       isect_array++;
+                                                                       (*num_hits)++;
+#if BVH_FEATURE(BVH_INSTANCING)
+                                                                       num_hits_in_instance++;
+#endif
+
+                                                                       isect_array->t = isect_t;
+                                                               }
+
+                                                               prim_addr++;
+                                                       } //while
+                                       } else {
+                                                       kernel_assert((kernel_tex_fetch(__prim_type, (prim_addr)) & PRIMITIVE_ALL) == p_type);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+                                                       int* nhiptr = &num_hits_in_instance;
+#else
+                                                       int nhi= 0;
+                                                       int *nhiptr = &nhi;
+#endif
+
+                                                       int result = triangle_intersect8(kg,
+                                                                                        &isect_array,
+                                                                                        P,
+                                                                                        dir,
+                                                                                        PATH_RAY_SHADOW,
+                                                                                        object,
+                                                                                        prim_addr,
+                                                                                        prim_count,
+                                                                                        num_hits,
+                                                                                        max_hits,
+                                                                                        nhiptr,
+                                                                                        isect_t);
+                                                       if(result == 2) {
+                                                               return true;
+                                                       }
+                                               }   // prim_count
+                                       }  // PRIMITIVE_TRIANGLE
+                                       else {
+                                                       while(prim_addr < prim_addr2) {
+                                                       kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);
+
+#ifdef __SHADOW_TRICKS__
+                                                       uint tri_object = (object == OBJECT_NONE)
+                                                                       ? kernel_tex_fetch(__prim_object, prim_addr)
+                                                                       : object;
+                                                       if(tri_object == skip_object) {
+                                                               ++prim_addr;
+                                                               continue;
+                                                       }
+#endif
+
+                                                       bool hit;
+
+                                                       /* todo: specialized intersect functions which don't fill in
+                                                        * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW?
+                                                        * might give a few % performance improvement */
+
+                                                       switch(p_type) {
+
+#if BVH_FEATURE(BVH_MOTION)
+                                                               case PRIMITIVE_MOTION_TRIANGLE: {
+                                                                       hit = motion_triangle_intersect(kg,
+                                                                                                       isect_array,
+                                                                                                       P,
+                                                                                                       dir,
+                                                                                                       ray->time,
+                                                                                                       PATH_RAY_SHADOW,
+                                                                                                       object,
+                                                                                                       prim_addr);
+                                                                       break;
+                                                               }
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+                                                               case PRIMITIVE_CURVE:
+                                                               case PRIMITIVE_MOTION_CURVE: {
+                                                                       const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
+                                                                       if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
+                                                                               hit = cardinal_curve_intersect(kg,
+                                                                                                              isect_array,
+                                                                                                              P,
+                                                                                                              dir,
+                                                                                                              PATH_RAY_SHADOW,
+                                                                                                              object,
+                                                                                                              prim_addr,
+                                                                                                              ray->time,
+                                                                                                              curve_type,
+                                                                                                              NULL,
+                                                                                                              0, 0);
+                                                                       }
+                                                                       else {
+                                                                               hit = curve_intersect(kg,
+                                                                                                     isect_array,
+                                                                                                     P,
+                                                                                                     dir,
+                                                                                                     PATH_RAY_SHADOW,
+                                                                                                     object,
+                                                                                                     prim_addr,
+                                                                                                     ray->time,
+                                                                                                     curve_type,
+                                                                                                     NULL,
+                                                                                                     0, 0);
+                                                                       }
+                                                                       break;
+                                                               }
+#endif
+                                                               default: {
+                                                                       hit = false;
+                                                                       break;
+                                                               }
+                                                       }
+
+                                                       /* Shadow ray early termination. */
+                                                       if(hit) {
+                                                               /* detect if this surface has a shader with transparent shadows */
+
+                                                               /* todo: optimize so primitive visibility flag indicates if
+                                                                * the primitive has a transparent shadow shader? */
+                                                               int prim = kernel_tex_fetch(__prim_index, isect_array->prim);
+                                                               int shader = 0;
+
+#ifdef __HAIR__
+                                                               if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
+#endif
+                                                               {
+                                                                       shader = kernel_tex_fetch(__tri_shader, prim);
+                                                               }
+#ifdef __HAIR__
+                                                               else {
+                                                                       float4 str = kernel_tex_fetch(__curves, prim);
+                                                                       shader = __float_as_int(str.z);
+                                                               }
+#endif
+                                                               int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
+
+                                                               /* if no transparent shadows, all light is blocked */
+                                                               if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
+                                                                       return true;
+                                                               }
+                                                               /* if maximum number of hits reached, block all light */
+                                                               else if(*num_hits == max_hits) {
+                                                                       return true;
+                                                               }
+
+                                                               /* move on to next entry in intersections array */
+                                                               isect_array++;
+                                                               (*num_hits)++;
+#if BVH_FEATURE(BVH_INSTANCING)
+                                                               num_hits_in_instance++;
+#endif
+
+                                                               isect_array->t = isect_t;
+                                                       }
+
+                                                       prim_addr++;
+                                               }//while prim
+                                       }
+                               }
+#if BVH_FEATURE(BVH_INSTANCING)
+                               else {
+                                       /* Instance push. */
+                                       object = kernel_tex_fetch(__prim_object, -prim_addr-1);
+
+#  if BVH_FEATURE(BVH_MOTION)
+                                       isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
+#  else
+                                       isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
+#  endif
+
+                                       num_hits_in_instance = 0;
+                                       isect_array->t = isect_t;
+
+                                       obvh_near_far_idx_calc(idir,
+                                                              &near_x, &near_y, &near_z,
+                                                              &far_x, &far_y, &far_z);
+                                       tfar = avxf(isect_t);
+#  if BVH_FEATURE(BVH_HAIR)
+                                       dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+#  endif
+                                       idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+#  ifdef __KERNEL_AVX2__
+                                       P_idir = P*idir;
+                                       P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
+#  endif
+#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+                                       org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
+#  endif
+
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
+
+                                       node_addr = kernel_tex_fetch(__object_node, object);
+
+                               }
+                       }
+#endif  /* FEATURE(BVH_INSTANCING) */
+               } while(node_addr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+               if(stack_ptr >= 0) {
+                       kernel_assert(object != OBJECT_NONE);
+
+                       /* Instance pop. */
+                       if(num_hits_in_instance) {
+                               float t_fac;
+#  if BVH_FEATURE(BVH_MOTION)
+                               bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
+#  else
+                               bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
+#  endif
+                               /* Scale isect->t to adjust for instancing. */
+                               for(int i = 0; i < num_hits_in_instance; i++) {
+                                       (isect_array-i-1)->t *= t_fac;
+                               }
+                       }
+                       else {
+#  if BVH_FEATURE(BVH_MOTION)
+                               bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
+#  else
+                               bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
+#  endif
+                       }
+
+                       isect_t = tmax;
+                       isect_array->t = isect_t;
+
+                       obvh_near_far_idx_calc(idir,
+                                              &near_x, &near_y, &near_z,
+                                              &far_x, &far_y, &far_z);
+                       tfar = avxf(isect_t);
+#  if BVH_FEATURE(BVH_HAIR)
+                       dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+#  endif
+                       idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+#  ifdef __KERNEL_AVX2__
+                       P_idir = P*idir;
+                       P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
+#  endif
+#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+                       org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
+#  endif
+
+                       object = OBJECT_NONE;
+                       node_addr = traversal_stack[stack_ptr].addr;
+                       --stack_ptr;
+               }
+#endif  /* FEATURE(BVH_INSTANCING) */
+       } while(node_addr != ENTRYPOINT_SENTINEL);
+
+       return false;
+}
+
+#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/bvh/obvh_traversal.h b/intern/cycles/kernel/bvh/obvh_traversal.h
new file mode 100644 (file)
index 0000000..2021d8e
--- /dev/null
@@ -0,0 +1,642 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function, where various features can be
+ * enabled/disabled. This way we can compile optimized versions for each case
+ * without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+#if BVH_FEATURE(BVH_HAIR)
+#  define NODE_INTERSECT obvh_node_intersect
+#  define NODE_INTERSECT_ROBUST obvh_node_intersect_robust
+#else
+#  define NODE_INTERSECT obvh_aligned_node_intersect
+#  define NODE_INTERSECT_ROBUST obvh_aligned_node_intersect_robust
+#endif
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
+                                             const Ray *ray,
+                                             Intersection *isect,
+                                             const uint visibility
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+                                             ,uint *lcg_state,
+                                             float difl,
+                                             float extmax
+#endif
+                                             )
+{
+       /* Traversal stack in CUDA thread-local memory. */
+       OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
+       traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
+       traversal_stack[0].dist = -FLT_MAX;
+
+       /* Traversal variables in registers. */
+       int stack_ptr = 0;
+       int node_addr = kernel_data.bvh.root;
+       float node_dist = -FLT_MAX;
+
+       /* Ray parameters in registers. */
+       float3 P = ray->P;
+       float3 dir = bvh_clamp_direction(ray->D);
+       float3 idir = bvh_inverse_direction(dir);
+       int object = OBJECT_NONE;
+
+#if BVH_FEATURE(BVH_MOTION)
+       Transform ob_itfm;
+#endif
+
+#ifndef __KERNEL_SSE41__
+       if(!isfinite(P.x)) {
+               return false;
+       }
+#endif
+
+       isect->t = ray->t;
+       isect->u = 0.0f;
+       isect->v = 0.0f;
+       isect->prim = PRIM_NONE;
+       isect->object = OBJECT_NONE;
+
+       BVH_DEBUG_INIT();
+       avxf tnear(0.0f), tfar(ray->t);
+#if BVH_FEATURE(BVH_HAIR)
+       avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+#endif
+       avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+
+#ifdef __KERNEL_AVX2__
+       float3 P_idir = P*idir;
+       avx3f P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+       avx3f org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
+#endif
+
+       /* Offsets to select the side that becomes the lower or upper bound. */
+       int near_x, near_y, near_z;
+       int far_x, far_y, far_z;
+       obvh_near_far_idx_calc(idir,
+                              &near_x, &near_y, &near_z,
+                              &far_x, &far_y, &far_z);
+       /* Traversal loop. */
+       do {
+               do {
+                       /* Traverse internal nodes. */
+                       while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
+                               float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+                               (void)inodes;
+
+                               if(UNLIKELY(node_dist > isect->t)
+#if BVH_FEATURE(BVH_MOTION)
+                                  || UNLIKELY(ray->time < inodes.y)
+                                  || UNLIKELY(ray->time > inodes.z)
+#endif
+#ifdef __VISIBILITY_FLAG__
+                                  || (__float_as_uint(inodes.x) & visibility) == 0
+#endif
+                                )
+                               {
+                                       /* Pop. */
+                                       node_addr = traversal_stack[stack_ptr].addr;
+                                       node_dist = traversal_stack[stack_ptr].dist;
+                                       --stack_ptr;
+                                       continue;
+                               }
+
+                               int child_mask;
+                               avxf dist;
+
+                               BVH_DEBUG_NEXT_NODE();
+
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+                               if(difl != 0.0f) {
+                                       /* NOTE: We extend all the child BB instead of fetching
+                                        * and checking visibility flags for each of the,
+                                        *
+                                        * Need to test if doing opposite would be any faster.
+                                        */
+                                       child_mask = NODE_INTERSECT_ROBUST(kg,
+                                                                          tnear,
+                                                                          tfar,
+#  ifdef __KERNEL_AVX2__
+                                                                          P_idir4,
+#  endif
+#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+                                                                          org4,
+#  endif
+#  if BVH_FEATURE(BVH_HAIR)
+                                                                          dir4,
+#  endif
+                                                                          idir4,
+                                                                          near_x, near_y, near_z,
+                                                                          far_x, far_y, far_z,
+                                                                          node_addr,
+                                                                          difl,
+                                                                          &dist);
+                               }
+                               else
+#endif  /* BVH_HAIR_MINIMUM_WIDTH */
+                               {
+                                       child_mask = NODE_INTERSECT(kg,
+                                                                   tnear,
+                                                                   tfar,
+#ifdef __KERNEL_AVX2__
+                                                                   P_idir4,
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+                                                                   org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+                                                                   dir4,
+#endif
+                                                                   idir4,
+                                                                   near_x, near_y, near_z,
+                                                                   far_x, far_y, far_z,
+                                                                   node_addr,
+                                                                   &dist);
+                               }
+
+                               if(child_mask != 0) {
+                                       avxf cnodes;
+                                       /* TODO(sergey): Investigate whether moving cnodes upwards
+                                        * gives a speedup (will be different cache pattern but will
+                                        * avoid extra check here),
+                                        */
+#if BVH_FEATURE(BVH_HAIR)
+                                       if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+                                               cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26);
+                                       }
+                                       else
+#endif
+                                       {
+                                               cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14);
+                                       }
+
+                                       /* One child is hit, continue with that child. */
+                                       int r = __bscf(child_mask);
+                                       float d0 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               node_addr = __float_as_int(cnodes[r]);
+                                               node_dist = d0;
+                                               continue;
+                                       }
+
+                                       /* Two children are hit, push far child, and continue with
+                                        * closer child.
+                                        */
+                                       int c0 = __float_as_int(cnodes[r]);
+                                       r = __bscf(child_mask);
+                                       int c1 = __float_as_int(cnodes[r]);
+                                       float d1 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               if(d1 < d0) {
+                                                       node_addr = c1;
+                                                       node_dist = d1;
+                                                       ++stack_ptr;
+                                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                                       traversal_stack[stack_ptr].addr = c0;
+                                                       traversal_stack[stack_ptr].dist = d0;
+                                                       continue;
+                                               }
+                                               else {
+                                                       node_addr = c0;
+                                                       node_dist = d0;
+                                                       ++stack_ptr;
+                                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                                       traversal_stack[stack_ptr].addr = c1;
+                                                       traversal_stack[stack_ptr].dist = d1;
+                                                       continue;
+                                               }
+                                       }
+
+                                       /* Here starts the slow path for 3 or 4 hit children. We push
+                                        * all nodes onto the stack to sort them there.
+                                        */
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c1;
+                                       traversal_stack[stack_ptr].dist = d1;
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c0;
+                                       traversal_stack[stack_ptr].dist = d0;
+
+                                       /* Three children are hit, push all onto stack and sort 3
+                                        * stack items, continue with closest child.
+                                        */
+                                       r = __bscf(child_mask);
+                                       int c2 = __float_as_int(cnodes[r]);
+                                       float d2 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c2;
+                                               traversal_stack[stack_ptr].dist = d2;
+                                               obvh_stack_sort(&traversal_stack[stack_ptr],
+                                                               &traversal_stack[stack_ptr - 1],
+                                                               &traversal_stack[stack_ptr - 2]);
+                                               node_addr = traversal_stack[stack_ptr].addr;
+                                               node_dist = traversal_stack[stack_ptr].dist;
+                                               --stack_ptr;
+                                               continue;
+                                       }
+
+                                       /* Four children are hit, push all onto stack and sort 4
+                                        * stack items, continue with closest child.
+                                        */
+                                       r = __bscf(child_mask);
+                                       int c3 = __float_as_int(cnodes[r]);
+                                       float d3 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c3;
+                                               traversal_stack[stack_ptr].dist = d3;
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c2;
+                                               traversal_stack[stack_ptr].dist = d2;
+                                               obvh_stack_sort(&traversal_stack[stack_ptr],
+                                                               &traversal_stack[stack_ptr - 1],
+                                                               &traversal_stack[stack_ptr - 2],
+                                                               &traversal_stack[stack_ptr - 3]);
+                                               node_addr = traversal_stack[stack_ptr].addr;
+                                               node_dist = traversal_stack[stack_ptr].dist;
+                                               --stack_ptr;
+                                               continue;
+                                       }
+
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c3;
+                                       traversal_stack[stack_ptr].dist = d3;
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c2;
+                                       traversal_stack[stack_ptr].dist = d2;
+
+                                       /* Five children are hit, push all onto stack and sort 5
+                                        * stack items, continue with closest child.
+                                        */
+                                       r = __bscf(child_mask);
+                                       int c4 = __float_as_int(cnodes[r]);
+                                       float d4 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c4;
+                                               traversal_stack[stack_ptr].dist = d4;
+                                               obvh_stack_sort(&traversal_stack[stack_ptr],
+                                                               &traversal_stack[stack_ptr - 1],
+                                                               &traversal_stack[stack_ptr - 2],
+                                                               &traversal_stack[stack_ptr - 3],
+                                                               &traversal_stack[stack_ptr - 4]);
+                                               node_addr = traversal_stack[stack_ptr].addr;
+                                               node_dist = traversal_stack[stack_ptr].dist;
+                                               --stack_ptr;
+                                               continue;
+                                       }
+
+                                       /* Six children are hit, push all onto stack and sort 6
+                                        * stack items, continue with closest child.
+                                        */
+                                       r = __bscf(child_mask);
+                                       int c5 = __float_as_int(cnodes[r]);
+                                       float d5 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c5;
+                                               traversal_stack[stack_ptr].dist = d5;
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c4;
+                                               traversal_stack[stack_ptr].dist = d4;
+                                               obvh_stack_sort(&traversal_stack[stack_ptr],
+                                                               &traversal_stack[stack_ptr - 1],
+                                                               &traversal_stack[stack_ptr - 2],
+                                                               &traversal_stack[stack_ptr - 3],
+                                                               &traversal_stack[stack_ptr - 4],
+                                                               &traversal_stack[stack_ptr - 5]);
+                                               node_addr = traversal_stack[stack_ptr].addr;
+                                               node_dist = traversal_stack[stack_ptr].dist;
+                                               --stack_ptr;
+                                               continue;
+                                       }
+
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c5;
+                                       traversal_stack[stack_ptr].dist = d5;
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c4;
+                                       traversal_stack[stack_ptr].dist = d4;
+
+                                       /* Seven children are hit, push all onto stack and sort 7
+                                        * stack items, continue with closest child.
+                                        */
+                                       r = __bscf(child_mask);
+                                       int c6 = __float_as_int(cnodes[r]);
+                                       float d6 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c6;
+                                               traversal_stack[stack_ptr].dist = d6;
+                                               obvh_stack_sort(&traversal_stack[stack_ptr],
+                                                               &traversal_stack[stack_ptr - 1],
+                                                               &traversal_stack[stack_ptr - 2],
+                                                               &traversal_stack[stack_ptr - 3],
+                                                               &traversal_stack[stack_ptr - 4],
+                                                               &traversal_stack[stack_ptr - 5],
+                                                               &traversal_stack[stack_ptr - 6]);
+                                               node_addr = traversal_stack[stack_ptr].addr;
+                                               node_dist = traversal_stack[stack_ptr].dist;
+                                               --stack_ptr;
+                                               continue;
+                                       }
+
+                                       /* Eight children are hit, push all onto stack and sort 8
+                                       * stack items, continue with closest child.
+                                       */
+                                       r = __bscf(child_mask);
+                                       int c7 = __float_as_int(cnodes[r]);
+                                       float d7 = ((float*)&dist)[r];
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c7;
+                                       traversal_stack[stack_ptr].dist = d7;
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c6;
+                                       traversal_stack[stack_ptr].dist = d6;
+                                       obvh_stack_sort(&traversal_stack[stack_ptr],
+                                                       &traversal_stack[stack_ptr - 1],
+                                                       &traversal_stack[stack_ptr - 2],
+                                                       &traversal_stack[stack_ptr - 3],
+                                                       &traversal_stack[stack_ptr - 4],
+                                                       &traversal_stack[stack_ptr - 5],
+                                                       &traversal_stack[stack_ptr - 6],
+                                                       &traversal_stack[stack_ptr - 7]);
+                                       node_addr = traversal_stack[stack_ptr].addr;
+                                       node_dist = traversal_stack[stack_ptr].dist;
+                                       --stack_ptr;
+                                       continue;
+                               }
+
+
+                               node_addr = traversal_stack[stack_ptr].addr;
+                               node_dist = traversal_stack[stack_ptr].dist;
+                               --stack_ptr;
+                       }
+
+                       /* If node is leaf, fetch triangle list. */
+                       if(node_addr < 0) {
+                               float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
+
+#ifdef __VISIBILITY_FLAG__
+                               if(UNLIKELY((node_dist > isect->t) ||
+                                           ((__float_as_uint(leaf.z) & visibility) == 0)))
+#else
+                               if(UNLIKELY((node_dist > isect->t)))
+#endif
+                               {
+                                       /* Pop. */
+                                       node_addr = traversal_stack[stack_ptr].addr;
+                                       node_dist = traversal_stack[stack_ptr].dist;
+                                       --stack_ptr;
+                                       continue;
+                               }
+                               int prim_addr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+                               if(prim_addr >= 0) {
+#endif
+                                       int prim_addr2 = __float_as_int(leaf.y);
+                                       const uint type = __float_as_int(leaf.w);
+
+                                       /* Pop. */
+                                       node_addr = traversal_stack[stack_ptr].addr;
+                                       node_dist = traversal_stack[stack_ptr].dist;
+                                       --stack_ptr;
+
+                                       /* Primitive intersection. */
+                                       switch(type & PRIMITIVE_ALL) {
+                                       case PRIMITIVE_TRIANGLE: {
+                                               int prim_count = prim_addr2 - prim_addr;
+                                               if(prim_count < 3) {
+                                                       for(; prim_addr < prim_addr2; prim_addr++) {
+                                                                BVH_DEBUG_NEXT_INTERSECTION();
+                                                                kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+                                                                if(triangle_intersect(kg,
+                                                                                      isect,
+                                                                                      P,
+                                                                                      dir,
+                                                                                      visibility,
+                                                                                      object,
+                                                                                      prim_addr))
+                                                                {
+                                                                        tfar = avxf(isect->t);
+                                                                        /* Shadow ray early termination. */
+                                                                        if(visibility == PATH_RAY_SHADOW_OPAQUE) {
+                                                                                return true;
+                                                                        }
+                                                                }
+                                                       }//for
+                                               }
+                                               else {
+                                                       kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+                                                       if(triangle_intersect8(kg,
+                                                                              &isect,
+                                                                              P,
+                                                                              dir,
+                                                                              visibility,
+                                                                              object,
+                                                                              prim_addr,
+                                                                              prim_count,
+                                                                              0,
+                                                                              0,
+                                                                              NULL,
+                                                                              0.0f))
+                                                       {
+                                                               tfar = avxf(isect->t);
+                                                               if(visibility == PATH_RAY_SHADOW_OPAQUE) {
+                                                                       return true;
+                                                               }
+                                                       }
+                                               }//prim count
+                                               break;
+                                       }
+#if BVH_FEATURE(BVH_MOTION)
+                                               case PRIMITIVE_MOTION_TRIANGLE: {
+                                                       for(; prim_addr < prim_addr2; prim_addr++) {
+                                                               BVH_DEBUG_NEXT_INTERSECTION();
+                                                               kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+                                                               if(motion_triangle_intersect(kg,
+                                                                                            isect,
+                                                                                            P,
+                                                                                            dir,
+                                                                                            ray->time,
+                                                                                            visibility,
+                                                                                            object,
+                                                                                            prim_addr))
+                                                               {
+                                                                       tfar = avxf(isect->t);
+                                                                       /* Shadow ray early termination. */
+                                                                       if(visibility == PATH_RAY_SHADOW_OPAQUE) {
+                                                                               return true;
+                                                                       }
+                                                               }
+                                                       }
+                                                       break;
+                                               }
+#endif  /* BVH_FEATURE(BVH_MOTION) */
+#if BVH_FEATURE(BVH_HAIR)
+                                               case PRIMITIVE_CURVE:
+                                               case PRIMITIVE_MOTION_CURVE: {
+                                                       for(; prim_addr < prim_addr2; prim_addr++) {
+                                                               BVH_DEBUG_NEXT_INTERSECTION();
+                                                               const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
+                                                               kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL));
+                                                               bool hit;
+                                                               if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
+                                                                       hit = cardinal_curve_intersect(kg,
+                                                                                                      isect,
+                                                                                                      P,
+                                                                                                      dir,
+                                                                                                      visibility,
+                                                                                                      object,
+                                                                                                      prim_addr,
+                                                                                                      ray->time,
+                                                                                                      curve_type,
+                                                                                                      lcg_state,
+                                                                                                      difl,
+                                                                                                      extmax);
+                                                               }
+                                                               else {
+                                                                       hit = curve_intersect(kg,
+                                                                                             isect,
+                                                                                             P,
+                                                                                             dir,
+                                                                                             visibility,
+                                                                                             object,
+                                                                                             prim_addr,
+                                                                                             ray->time,
+                                                                                             curve_type,
+                                                                                             lcg_state,
+                                                                                             difl,
+                                                                                             extmax);
+                                                               }
+                                                               if(hit) {
+                                                                       tfar = avxf(isect->t);
+                                                                       /* Shadow ray early termination. */
+                                                                       if(visibility == PATH_RAY_SHADOW_OPAQUE) {
+                                                                               return true;
+                                                                       }
+                                                               }
+                                                       }
+                                                       break;
+                                               }
+#endif  /* BVH_FEATURE(BVH_HAIR) */
+                                       }
+                               }
+#if BVH_FEATURE(BVH_INSTANCING)
+                               else {
+                                       /* Instance push. */
+                                       object = kernel_tex_fetch(__prim_object, -prim_addr-1);
+
+#  if BVH_FEATURE(BVH_MOTION)
+                                       qbvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist, &ob_itfm);
+#  else
+                                       qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist);
+#  endif
+
+                                       obvh_near_far_idx_calc(idir,
+                                                              &near_x, &near_y, &near_z,
+                                                              &far_x, &far_y, &far_z);
+                                       tfar = avxf(isect->t);
+#  if BVH_FEATURE(BVH_HAIR)
+                                       dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+#  endif
+                                       idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+#  ifdef __KERNEL_AVX2__
+                                       P_idir = P*idir;
+                                       P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
+#  endif
+#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+                                       org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
+#  endif
+
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
+                                       traversal_stack[stack_ptr].dist = -FLT_MAX;
+
+                                       node_addr = kernel_tex_fetch(__object_node, object);
+
+                                       BVH_DEBUG_NEXT_INSTANCE();
+                               }
+                       }
+#endif  /* FEATURE(BVH_INSTANCING) */
+               } while(node_addr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+               if(stack_ptr >= 0) {
+                       kernel_assert(object != OBJECT_NONE);
+
+                       /* Instance pop. */
+#  if BVH_FEATURE(BVH_MOTION)
+                       isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
+#  else
+                       isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
+#  endif
+
+                       obvh_near_far_idx_calc(idir,
+                                              &near_x, &near_y, &near_z,
+                                              &far_x, &far_y, &far_z);
+                       tfar = avxf(isect->t);
+#  if BVH_FEATURE(BVH_HAIR)
+                       dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+#  endif
+                       idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+#  ifdef __KERNEL_AVX2__
+                       P_idir = P*idir;
+                       P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
+#  endif
+#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+                       org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
+#  endif
+
+                       object = OBJECT_NONE;
+                       node_addr = traversal_stack[stack_ptr].addr;
+                       node_dist = traversal_stack[stack_ptr].dist;
+                       --stack_ptr;
+               }
+#endif  /* FEATURE(BVH_INSTANCING) */
+       } while(node_addr != ENTRYPOINT_SENTINEL);
+
+       return (isect->prim != PRIM_NONE);
+}
+
+#undef NODE_INTERSECT
+#undef NODE_INTERSECT_ROBUST
diff --git a/intern/cycles/kernel/bvh/obvh_volume.h b/intern/cycles/kernel/bvh/obvh_volume.h
new file mode 100644 (file)
index 0000000..da9ddbd
--- /dev/null
@@ -0,0 +1,483 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function for volumes, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+#if BVH_FEATURE(BVH_HAIR)
+#  define NODE_INTERSECT obvh_node_intersect
+#else
+#  define NODE_INTERSECT obvh_aligned_node_intersect
+#endif
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
+                                             const Ray *ray,
+                                             Intersection *isect,
+                                             const uint visibility)
+{
+       /* Traversal stack in CUDA thread-local memory. */
+       OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
+       traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
+
+       /* Traversal variables in registers. */
+       int stack_ptr = 0;
+       int node_addr = kernel_data.bvh.root;
+
+       /* Ray parameters in registers. */
+       float3 P = ray->P;
+       float3 dir = bvh_clamp_direction(ray->D);
+       float3 idir = bvh_inverse_direction(dir);
+       int object = OBJECT_NONE;
+
+#if BVH_FEATURE(BVH_MOTION)
+       Transform ob_itfm;
+#endif
+
+#ifndef __KERNEL_SSE41__
+       if(!isfinite(P.x)) {
+               return false;
+       }
+#endif
+
+       isect->t = ray->t;
+       isect->u = 0.0f;
+       isect->v = 0.0f;
+       isect->prim = PRIM_NONE;
+       isect->object = OBJECT_NONE;
+
+       avxf tnear(0.0f), tfar(ray->t);
+#if BVH_FEATURE(BVH_HAIR)
+       avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+#endif
+       avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+
+#ifdef __KERNEL_AVX2__
+       float3 P_idir = P*idir;
+       avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+       avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z));
+#endif
+
+       /* Offsets to select the side that becomes the lower or upper bound. */
+       int near_x, near_y, near_z;
+       int far_x, far_y, far_z;
+       obvh_near_far_idx_calc(idir,
+                              &near_x, &near_y, &near_z,
+                              &far_x, &far_y, &far_z);
+
+       /* Traversal loop. */
+       do {
+               do {
+                       /* Traverse internal nodes. */
+                       while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
+                               float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+
+#ifdef __VISIBILITY_FLAG__
+                               if((__float_as_uint(inodes.x) & visibility) == 0) {
+                                       /* Pop. */
+                                       node_addr = traversal_stack[stack_ptr].addr;
+                                       --stack_ptr;
+                                       continue;
+                               }
+#endif
+
+                               avxf dist;
+                               int child_mask = NODE_INTERSECT(kg,
+                                                               tnear,
+                                                               tfar,
+#ifdef __KERNEL_AVX2__
+                                                               P_idir4,
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+                                                               org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+                                                               dir4,
+#endif
+                                                               idir4,
+                                                               near_x, near_y, near_z,
+                                                               far_x, far_y, far_z,
+                                                               node_addr,
+                                                               &dist);
+
+                               if(child_mask != 0) {
+                                       avxf cnodes;
+#if BVH_FEATURE(BVH_HAIR)
+                                       if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+                                               cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26);
+                                       }
+                                       else
+#endif
+                                       {
+                                               cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14);
+                                       }
+
+                                       /* One child is hit, continue with that child. */
+                                       int r = __bscf(child_mask);
+                                       if(child_mask == 0) {
+                                               node_addr = __float_as_int(cnodes[r]);
+                                               continue;
+                                       }
+
+                                       /* Two children are hit, push far child, and continue with
+                                        * closer child.
+                                        */
+                                       int c0 = __float_as_int(cnodes[r]);
+                                       float d0 = ((float*)&dist)[r];
+                                       r = __bscf(child_mask);
+                                       int c1 = __float_as_int(cnodes[r]);
+                                       float d1 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               if(d1 < d0) {
+                                                       node_addr = c1;
+                                                       ++stack_ptr;
+                                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                                       traversal_stack[stack_ptr].addr = c0;
+                                                       traversal_stack[stack_ptr].dist = d0;
+                                                       continue;
+                                               }
+                                               else {
+                                                       node_addr = c0;
+                                                       ++stack_ptr;
+                                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                                       traversal_stack[stack_ptr].addr = c1;
+                                                       traversal_stack[stack_ptr].dist = d1;
+                                                       continue;
+                                               }
+                                       }
+
+                                       /* Here starts the slow path for 3 or 4 hit children. We push
+                                        * all nodes onto the stack to sort them there.
+                                        */
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c1;
+                                       traversal_stack[stack_ptr].dist = d1;
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c0;
+                                       traversal_stack[stack_ptr].dist = d0;
+
+                                       /* Three children are hit, push all onto stack and sort 3
+                                        * stack items, continue with closest child.
+                                        */
+                                       r = __bscf(child_mask);
+                                       int c2 = __float_as_int(cnodes[r]);
+                                       float d2 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c2;
+                                               traversal_stack[stack_ptr].dist = d2;
+                                               obvh_stack_sort(&traversal_stack[stack_ptr],
+                                                               &traversal_stack[stack_ptr - 1],
+                                                               &traversal_stack[stack_ptr - 2]);
+                                               node_addr = traversal_stack[stack_ptr].addr;
+                                               --stack_ptr;
+                                               continue;
+                                       }
+
+                                       /* Four children are hit, push all onto stack and sort 4
+                                        * stack items, continue with closest child.
+                                        */
+                                       r = __bscf(child_mask);
+                                       int c3 = __float_as_int(cnodes[r]);
+                                       float d3 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c3;
+                                               traversal_stack[stack_ptr].dist = d3;
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c2;
+                                               traversal_stack[stack_ptr].dist = d2;
+                                               obvh_stack_sort(&traversal_stack[stack_ptr],
+                                                               &traversal_stack[stack_ptr - 1],
+                                                               &traversal_stack[stack_ptr - 2],
+                                                               &traversal_stack[stack_ptr - 3]);
+                                               node_addr = traversal_stack[stack_ptr].addr;
+                                               --stack_ptr;
+                                               continue;
+                                       }
+
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c3;
+                                       traversal_stack[stack_ptr].dist = d3;
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c2;
+                                       traversal_stack[stack_ptr].dist = d2;
+
+                                       /* Five children are hit, push all onto stack and sort 5
+                                        * stack items, continue with closest child
+                                        */
+                                       r = __bscf(child_mask);
+                                       int c4 = __float_as_int(cnodes[r]);
+                                       float d4 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c4;
+                                               traversal_stack[stack_ptr].dist = d4;
+                                               obvh_stack_sort(&traversal_stack[stack_ptr],
+                                                               &traversal_stack[stack_ptr - 1],
+                                                               &traversal_stack[stack_ptr - 2],
+                                                               &traversal_stack[stack_ptr - 3],
+                                                               &traversal_stack[stack_ptr - 4]);
+                                               node_addr = traversal_stack[stack_ptr].addr;
+                                               --stack_ptr;
+                                               continue;
+                                       }
+
+                                       /* Six children are hit, push all onto stack and sort 6
+                                        * stack items, continue with closest child.
+                                        */
+                                       r = __bscf(child_mask);
+                                       int c5 = __float_as_int(cnodes[r]);
+                                       float d5 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c5;
+                                               traversal_stack[stack_ptr].dist = d5;
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c4;
+                                               traversal_stack[stack_ptr].dist = d4;
+                                               obvh_stack_sort(&traversal_stack[stack_ptr],
+                                                               &traversal_stack[stack_ptr - 1],
+                                                               &traversal_stack[stack_ptr - 2],
+                                                               &traversal_stack[stack_ptr - 3],
+                                                               &traversal_stack[stack_ptr - 4],
+                                                               &traversal_stack[stack_ptr - 5]);
+                                               node_addr = traversal_stack[stack_ptr].addr;
+                                               --stack_ptr;
+                                               continue;
+                                       }
+
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c5;
+                                       traversal_stack[stack_ptr].dist = d5;
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c4;
+                                       traversal_stack[stack_ptr].dist = d4;
+
+                                       /* Seven children are hit, push all onto stack and sort 7
+                                        * stack items, continue with closest child.
+                                        */
+                                       r = __bscf(child_mask);
+                                       int c6 = __float_as_int(cnodes[r]);
+                                       float d6 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c6;
+                                               traversal_stack[stack_ptr].dist = d6;
+                                               obvh_stack_sort(&traversal_stack[stack_ptr],
+                                                               &traversal_stack[stack_ptr - 1],
+                                                               &traversal_stack[stack_ptr - 2],
+                                                               &traversal_stack[stack_ptr - 3],
+                                                               &traversal_stack[stack_ptr - 4],
+                                                               &traversal_stack[stack_ptr - 5],
+                                                               &traversal_stack[stack_ptr - 6]);
+                                               node_addr = traversal_stack[stack_ptr].addr;
+                                               --stack_ptr;
+                                               continue;
+                                       }
+
+                                       /* Eight children are hit, push all onto stack and sort 8
+                                        * stack items, continue with closest child.
+                                        */
+                                       r = __bscf(child_mask);
+                                       int c7 = __float_as_int(cnodes[r]);
+                                       float d7 = ((float*)&dist)[r];
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c7;
+                                       traversal_stack[stack_ptr].dist = d7;
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c6;
+                                       traversal_stack[stack_ptr].dist = d6;
+                                       obvh_stack_sort(&traversal_stack[stack_ptr],
+                                                       &traversal_stack[stack_ptr - 1],
+                                                       &traversal_stack[stack_ptr - 2],
+                                                       &traversal_stack[stack_ptr - 3],
+                                                       &traversal_stack[stack_ptr - 4],
+                                                       &traversal_stack[stack_ptr - 5],
+                                                       &traversal_stack[stack_ptr - 6],
+                                                       &traversal_stack[stack_ptr - 7]);
+                                       node_addr = traversal_stack[stack_ptr].addr;
+                                       --stack_ptr;
+                                       continue;
+                               }
+
+                               node_addr = traversal_stack[stack_ptr].addr;
+                               --stack_ptr;
+                       }
+
+                       /* If node is leaf, fetch triangle list. */
+                       if(node_addr < 0) {
+                               float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
+
+                               if((__float_as_uint(leaf.z) & visibility) == 0) {
+                                       /* Pop. */
+                                       node_addr = traversal_stack[stack_ptr].addr;
+                                       --stack_ptr;
+                                       continue;
+                               }
+
+                               int prim_addr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+                               if(prim_addr >= 0) {
+#endif
+                                       int prim_addr2 = __float_as_int(leaf.y);
+                                       const uint type = __float_as_int(leaf.w);
+                                       const uint p_type = type & PRIMITIVE_ALL;
+
+                                       /* Pop. */
+                                       node_addr = traversal_stack[stack_ptr].addr;
+                                       --stack_ptr;
+
+                                       /* Primitive intersection. */
+                                       switch(p_type) {
+                                               case PRIMITIVE_TRIANGLE: {
+                                                       for(; prim_addr < prim_addr2; prim_addr++) {
+                                                               kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+                                                               /* Only primitives from volume object. */
+                                                               uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object;
+                                                               int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+                                                               if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+                                                                       continue;
+                                                               }
+                                                               /* Intersect ray against primitive. */
+                                                               triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr);
+                                                       }
+                                                       break;
+                                               }
+#if BVH_FEATURE(BVH_MOTION)
+                                               case PRIMITIVE_MOTION_TRIANGLE: {
+                                                       for(; prim_addr < prim_addr2; prim_addr++) {
+                                                               kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+                                                               /* Only primitives from volume object. */
+                                                               uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object;
+                                                               int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+                                                               if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+                                                                       continue;
+                                                               }
+                                                               /* Intersect ray against primitive. */
+                                                               motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, prim_addr);
+                                                       }
+                                                       break;
+                                               }
+#endif
+                                       }
+                               }
+#if BVH_FEATURE(BVH_INSTANCING)
+                               else {
+                                       /* Instance push. */
+                                       object = kernel_tex_fetch(__prim_object, -prim_addr-1);
+                                       int object_flag = kernel_tex_fetch(__object_flag, object);
+                                       if(object_flag & SD_OBJECT_HAS_VOLUME) {
+#  if BVH_FEATURE(BVH_MOTION)
+                                               isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
+#  else
+                                               isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
+#  endif
+
+                                               obvh_near_far_idx_calc(idir,
+                                                                      &near_x, &near_y, &near_z,
+                                                                      &far_x, &far_y, &far_z);
+                                               tfar = avxf(isect->t);
+#  if BVH_FEATURE(BVH_HAIR)
+                                               dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+#  endif
+                                               idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+#  ifdef __KERNEL_AVX2__
+                                               P_idir = P*idir;
+                                               P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
+#  endif
+#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+                                               org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
+#  endif
+
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
+
+                                               node_addr = kernel_tex_fetch(__object_node, object);
+                                       }
+                                       else {
+                                               /* Pop. */
+                                               object = OBJECT_NONE;
+                                               node_addr = traversal_stack[stack_ptr].addr;
+                                               --stack_ptr;
+                                       }
+                               }
+                       }
+#endif  /* FEATURE(BVH_INSTANCING) */
+               } while(node_addr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+               if(stack_ptr >= 0) {
+                       kernel_assert(object != OBJECT_NONE);
+
+                       /* Instance pop. */
+#  if BVH_FEATURE(BVH_MOTION)
+                       isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
+#  else
+                       isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
+#  endif
+
+                       obvh_near_far_idx_calc(idir,
+                                              &near_x, &near_y, &near_z,
+                                              &far_x, &far_y, &far_z);
+                       tfar = avxf(isect->t);
+#  if BVH_FEATURE(BVH_HAIR)
+                       dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+#  endif
+                       idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+#  ifdef __KERNEL_AVX2__
+                       P_idir = P*idir;
+                       P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
+#  endif
+#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+                       org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
+#  endif
+
+                       object = OBJECT_NONE;
+                       node_addr = traversal_stack[stack_ptr].addr;
+                       --stack_ptr;
+               }
+#endif  /* FEATURE(BVH_INSTANCING) */
+       } while(node_addr != ENTRYPOINT_SENTINEL);
+
+       return (isect->prim != PRIM_NONE);
+}
+
+#undef NODE_INTERSECT
diff --git a/intern/cycles/kernel/bvh/obvh_volume_all.h b/intern/cycles/kernel/bvh/obvh_volume_all.h
new file mode 100644 (file)
index 0000000..a88573e
--- /dev/null
@@ -0,0 +1,554 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function for volumes, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+#if BVH_FEATURE(BVH_HAIR)
+#  define NODE_INTERSECT obvh_node_intersect
+#else
+#  define NODE_INTERSECT obvh_aligned_node_intersect
+#endif
+
+ccl_device uint BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
+                                             const Ray *ray,
+                                             Intersection *isect_array,
+                                             const uint max_hits,
+                                             const uint visibility)
+{
+       /* Traversal stack in CUDA thread-local memory. */
+       OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
+       traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
+
+       /* Traversal variables in registers. */
+       int stack_ptr = 0;
+       int node_addr = kernel_data.bvh.root;
+
+       /* Ray parameters in registers. */
+       const float tmax = ray->t;
+       float3 P = ray->P;
+       float3 dir = bvh_clamp_direction(ray->D);
+       float3 idir = bvh_inverse_direction(dir);
+       int object = OBJECT_NONE;
+       float isect_t = tmax;
+
+#if BVH_FEATURE(BVH_MOTION)
+       Transform ob_itfm;
+#endif
+
+       uint num_hits = 0;
+       isect_array->t = tmax;
+
+#ifndef __KERNEL_SSE41__
+       if(!isfinite(P.x)) {
+               return 0;
+       }
+#endif
+
+#if BVH_FEATURE(BVH_INSTANCING)
+       int num_hits_in_instance = 0;
+#endif
+
+       avxf tnear(0.0f), tfar(isect_t);
+#if BVH_FEATURE(BVH_HAIR)
+       avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
+#endif
+       avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
+
+#ifdef __KERNEL_AVX2__
+       float3 P_idir = P*idir;
+       avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+       avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z));
+#endif
+
+       /* Offsets to select the side that becomes the lower or upper bound. */
+       int near_x, near_y, near_z;
+       int far_x, far_y, far_z;
+       obvh_near_far_idx_calc(idir,
+                              &near_x, &near_y, &near_z,
+                              &far_x, &far_y, &far_z);
+
+       /* Traversal loop. */
+       do {
+               do {
+                       /* Traverse internal nodes. */
+                       while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
+                               float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+
+#ifdef __VISIBILITY_FLAG__
+                               if((__float_as_uint(inodes.x) & visibility) == 0) {
+                                       /* Pop. */
+                                       node_addr = traversal_stack[stack_ptr].addr;
+                                       --stack_ptr;
+                                       continue;
+                               }
+#endif
+
+                               avxf dist;
+                               int child_mask = NODE_INTERSECT(kg,
+                                                                   tnear,
+                                                                   tfar,
+#ifdef __KERNEL_AVX2__
+                                                                   P_idir4,
+#endif
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+                                                                   org4,
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+                                                                   dir4,
+#endif
+                                                                   idir4,
+                                                                   near_x, near_y, near_z,
+                                                                   far_x, far_y, far_z,
+                                                                   node_addr,
+                                                                   &dist);
+
+                               if(child_mask != 0) {
+                                       avxf cnodes;
+#if BVH_FEATURE(BVH_HAIR)
+                                       if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
+                                               cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26);
+                                       }
+                                       else
+#endif
+                                       {
+                                               cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14);
+                                       }
+
+                                       /* One child is hit, continue with that child. */
+                                       int r = __bscf(child_mask);
+                                       if(child_mask == 0) {
+                                               node_addr = __float_as_int(cnodes[r]);
+                                               continue;
+                                       }
+
+                                       /* Two children are hit, push far child, and continue with
+                                        * closer child.
+                                        */
+                                       int c0 = __float_as_int(cnodes[r]);
+                                       float d0 = ((float*)&dist)[r];
+                                       r = __bscf(child_mask);
+                                       int c1 = __float_as_int(cnodes[r]);
+                                       float d1 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               if(d1 < d0) {
+                                                       node_addr = c1;
+                                                       ++stack_ptr;
+                                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                                       traversal_stack[stack_ptr].addr = c0;
+                                                       traversal_stack[stack_ptr].dist = d0;
+                                                       continue;
+                                               }
+                                               else {
+                                                       node_addr = c0;
+                                                       ++stack_ptr;
+                                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                                       traversal_stack[stack_ptr].addr = c1;
+                                                       traversal_stack[stack_ptr].dist = d1;
+                                                       continue;
+                                               }
+                                       }
+
+                                       /* Here starts the slow path for 3 or 4 hit children. We push
+                                        * all nodes onto the stack to sort them there.
+                                        */
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c1;
+                                       traversal_stack[stack_ptr].dist = d1;
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c0;
+                                       traversal_stack[stack_ptr].dist = d0;
+
+                                       /* Three children are hit, push all onto stack and sort 3
+                                        * stack items, continue with closest child.
+                                        */
+                                       r = __bscf(child_mask);
+                                       int c2 = __float_as_int(cnodes[r]);
+                                       float d2 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c2;
+                                               traversal_stack[stack_ptr].dist = d2;
+                                               obvh_stack_sort(&traversal_stack[stack_ptr],
+                                                               &traversal_stack[stack_ptr - 1],
+                                                               &traversal_stack[stack_ptr - 2]);
+                                               node_addr = traversal_stack[stack_ptr].addr;
+                                               --stack_ptr;
+                                               continue;
+                                       }
+
+                                       /* Four children are hit, push all onto stack and sort 4
+                                        * stack items, continue with closest child.
+                                        */
+                                       r = __bscf(child_mask);
+                                       int c3 = __float_as_int(cnodes[r]);
+                                       float d3 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c3;
+                                               traversal_stack[stack_ptr].dist = d3;
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c2;
+                                               traversal_stack[stack_ptr].dist = d2;
+                                               obvh_stack_sort(&traversal_stack[stack_ptr],
+                                                               &traversal_stack[stack_ptr - 1],
+                                                               &traversal_stack[stack_ptr - 2],
+                                                               &traversal_stack[stack_ptr - 3]);
+                                               node_addr = traversal_stack[stack_ptr].addr;
+                                               --stack_ptr;
+                                               continue;
+                                       }
+
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c3;
+                                       traversal_stack[stack_ptr].dist = d3;
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c2;
+                                       traversal_stack[stack_ptr].dist = d2;
+
+                                       /* Five children are hit, push all onto stack and sort 5
+                                        * stack items, continue with closest child.
+                                        */
+                                       r = __bscf(child_mask);
+                                       int c4 = __float_as_int(cnodes[r]);
+                                       float d4 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c4;
+                                               traversal_stack[stack_ptr].dist = d4;
+                                               obvh_stack_sort(&traversal_stack[stack_ptr],
+                                                               &traversal_stack[stack_ptr - 1],
+                                                               &traversal_stack[stack_ptr - 2],
+                                                               &traversal_stack[stack_ptr - 3],
+                                                               &traversal_stack[stack_ptr - 4]);
+                                               node_addr = traversal_stack[stack_ptr].addr;
+                                               --stack_ptr;
+                                               continue;
+                                       }
+
+                                       /* Six children are hit, push all onto stack and sort 6
+                                        * stack items, continue with closest child.
+                                        */
+                                       r = __bscf(child_mask);
+                                       int c5 = __float_as_int(cnodes[r]);
+                                       float d5 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c5;
+                                               traversal_stack[stack_ptr].dist = d5;
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c4;
+                                               traversal_stack[stack_ptr].dist = d4;
+                                               obvh_stack_sort(&traversal_stack[stack_ptr],
+                                                               &traversal_stack[stack_ptr - 1],
+                                                               &traversal_stack[stack_ptr - 2],
+                                                               &traversal_stack[stack_ptr - 3],
+                                                               &traversal_stack[stack_ptr - 4],
+                                                               &traversal_stack[stack_ptr - 5]);
+                                               node_addr = traversal_stack[stack_ptr].addr;
+                                               --stack_ptr;
+                                               continue;
+                                       }
+
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c5;
+                                       traversal_stack[stack_ptr].dist = d5;
+                                       ++stack_ptr;
+                                       kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                       traversal_stack[stack_ptr].addr = c4;
+                                       traversal_stack[stack_ptr].dist = d4;
+
+                                       /* Seven children are hit, push all onto stack and sort 7
+                                        * stack items, continue with closest child.
+                                        */
+                                       r = __bscf(child_mask);
+                                       int c6 = __float_as_int(cnodes[r]);
+                                       float d6 = ((float*)&dist)[r];
+                                       if(child_mask == 0) {
+                                               ++stack_ptr;
+                                               kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
+                                               traversal_stack[stack_ptr].addr = c6;
+                                               traversal_stack[stack_ptr].dist = d6;
+                                               obvh_stack_sort(&traversal_stack[stack_ptr],
+                                                               &traversal_stack[stack_ptr - 1],
+                                                               &traversal_stack[stack_ptr - 2],
+                                                               &traversal_stack[stack_ptr - 3],
+                                                               &traversal_stack[stack_ptr - 4],
+                                                               &traversal_stack[stack_ptr - 5],
+                                                               &traversal_stack[stack_ptr - 6]);
+                                               node_addr = traversal_stack[stack_ptr].addr;
+                                               --stack_ptr;
+                                               continue;
+                                       }
+
+                                       /* Eight children are hit, push all onto stack and sort 8
+                                        * stack items, continue with closest child.
+                                        */
+                                       r = __bscf(child_mask);
+                                       int c7 = __float_as_int(cnodes[r]);
+                                       float d7 = ((float*)&dist)[r];
+                                       ++stack_ptr;
+              &nb