Cycles: Support visibility check for inner nodes of QBVH
authorSergey Sharybin <sergey.vfx@gmail.com>
Fri, 10 Jun 2016 08:14:05 +0000 (10:14 +0200)
committerSergey Sharybin <sergey.vfx@gmail.com>
Thu, 7 Jul 2016 15:25:48 +0000 (17:25 +0200)
It was initially unsupported because initial idea of checking visibility
of all children was slowing scenes down a lot. Now the idea has changed
and we only perform visibility check of current node. This avoids huge
slowdown (from tests here it seems to be withing 1-2%, but more tests
would never hurt) and gives nice speedup of ray traversal for complex
scenes which utilized ray visibility.

Here's timing of koro.blend:

                  Without visibility check         With visibility check
Original file           4min 20sec                      4min 23sec
Camera rays only        1min 43 sec                       55sec

Unfortunately, this doesn't come for free and requires extra data in
BVH node, which increases memory usage of BVH nodes by 15%. This we
can solve with some future trickery of avoiding __tri_storage created
for curve segments.

intern/cycles/bvh/bvh.cpp
intern/cycles/bvh/bvh.h
intern/cycles/kernel/geom/geom.h
intern/cycles/kernel/geom/geom_qbvh.h
intern/cycles/kernel/geom/geom_qbvh_shadow.h
intern/cycles/kernel/geom/geom_qbvh_subsurface.h
intern/cycles/kernel/geom/geom_qbvh_traversal.h
intern/cycles/kernel/geom/geom_qbvh_volume.h
intern/cycles/kernel/geom/geom_qbvh_volume_all.h

index fa2b9ae7279c9f746b613894345fc675ba48bc72..311d7296c65573e34f02cb272cd9520dcb562de1 100644 (file)
@@ -338,12 +338,14 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
                        /* For QBVH we're packing a child bbox into 6 float4,
                         * and for regular BVH they're packed into 3 float4.
                         */
-                       size_t nsize_bbox = (use_qbvh)? 6: 3;
+                       const size_t nsize_bbox = (use_qbvh)? 7: 3;
                        int4 *bvh_nodes = &bvh->pack.nodes[0];
-                       size_t bvh_nodes_size = bvh->pack.nodes.size(); 
+                       size_t bvh_nodes_size = bvh->pack.nodes.size();
 
                        for(size_t i = 0, j = 0; i < bvh_nodes_size; i+=nsize, j++) {
-                               memcpy(pack_nodes + pack_nodes_offset, bvh_nodes + i, nsize_bbox*sizeof(int4));
+                               memcpy(pack_nodes + pack_nodes_offset,
+                                      bvh_nodes + i,
+                                      nsize_bbox*sizeof(int4));
 
                                /* modify offsets into arrays */
                                int4 data = bvh_nodes[i + nsize_bbox];
@@ -617,34 +619,35 @@ void QBVH::pack_inner(const BVHStackEntry& e, const BVHStackEntry *en, int num)
 {
        float4 data[BVH_QNODE_SIZE];
 
+       data[0].x = __uint_as_float(e.node->m_visibility);
        for(int i = 0; i < num; i++) {
                float3 bb_min = en[i].node->m_bounds.min;
                float3 bb_max = en[i].node->m_bounds.max;
 
-               data[0][i] = bb_min.x;
-               data[1][i] = bb_max.x;
-               data[2][i] = bb_min.y;
-               data[3][i] = bb_max.y;
-               data[4][i] = bb_min.z;
-               data[5][i] = bb_max.z;
+               data[1][i] = bb_min.x;
+               data[2][i] = bb_max.x;
+               data[3][i] = bb_min.y;
+               data[4][i] = bb_max.y;
+               data[5][i] = bb_min.z;
+               data[6][i] = bb_max.z;
 
-               data[6][i] = __int_as_float(en[i].encodeIdx());
+               data[7][i] = __int_as_float(en[i].encodeIdx());
        }
 
        for(int i = num; i < 4; i++) {
                /* We store BB which would never be recorded as intersection
                 * so kernel might safely assume there are always 4 child nodes.
                 */
-               data[0][i] = FLT_MAX;
-               data[1][i] = -FLT_MAX;
+               data[1][i] = FLT_MAX;
+               data[2][i] = -FLT_MAX;
 
-               data[2][i] = FLT_MAX;
-               data[3][i] = -FLT_MAX;
+               data[3][i] = FLT_MAX;
+               data[4][i] = -FLT_MAX;
 
-               data[4][i] = FLT_MAX;
-               data[5][i] = -FLT_MAX;
+               data[5][i] = FLT_MAX;
+               data[6][i] = -FLT_MAX;
 
-               data[6][i] = __int_as_float(0);
+               data[7][i] = __int_as_float(0);
        }
 
        memcpy(&pack.nodes[e.idx * BVH_QNODE_SIZE], data, sizeof(float4)*BVH_QNODE_SIZE);
@@ -839,7 +842,7 @@ void QBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
        }
        else {
                int4 *data = &pack.nodes[idx*BVH_QNODE_SIZE];
-               int4 c = data[6];
+               int4 c = data[7];
                /* Refit inner node, set bbox from children. */
                BoundBox child_bbox[4] = {BoundBox::empty,
                                          BoundBox::empty,
@@ -859,16 +862,20 @@ void QBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
                }
 
                float4 inner_data[BVH_QNODE_SIZE];
+               inner_data[0] = make_float4(__int_as_float(data[0].x),
+                                           __int_as_float(data[1].y),
+                                           __int_as_float(data[2].z),
+                                           __int_as_float(data[3].w));
                for(int i = 0; i < 4; ++i) {
                        float3 bb_min = child_bbox[i].min;
                        float3 bb_max = child_bbox[i].max;
-                       inner_data[0][i] = bb_min.x;
-                       inner_data[1][i] = bb_max.x;
-                       inner_data[2][i] = bb_min.y;
-                       inner_data[3][i] = bb_max.y;
-                       inner_data[4][i] = bb_min.z;
-                       inner_data[5][i] = bb_max.z;
-                       inner_data[6][i] = __int_as_float(c[i]);
+                       inner_data[1][i] = bb_min.x;
+                       inner_data[2][i] = bb_max.x;
+                       inner_data[3][i] = bb_min.y;
+                       inner_data[4][i] = bb_max.y;
+                       inner_data[5][i] = bb_min.z;
+                       inner_data[6][i] = bb_max.z;
+                       inner_data[7][i] = __int_as_float(c[i]);
                }
                memcpy(&pack.nodes[idx * BVH_QNODE_SIZE],
                       inner_data,
index 6076c25ca3128e635bd4c586a8aebcc8c3c62811..b1655f0c5d05463a63f48dd0fbe614d760c58c68 100644 (file)
@@ -35,7 +35,7 @@ class Progress;
 
 #define BVH_NODE_SIZE  4
 #define BVH_NODE_LEAF_SIZE     1
-#define BVH_QNODE_SIZE 7
+#define BVH_QNODE_SIZE 8
 #define BVH_QNODE_LEAF_SIZE    1
 #define BVH_ALIGN              4096
 #define TRI_NODE_SIZE  3
index 2949f66c2ae4753a8ad97da6f7095f1a9ae463d4..26e0af99bbce3b800149df0f6595e0f9a99c847e 100644 (file)
@@ -23,7 +23,7 @@
 #define BVH_QSTACK_SIZE 384
 #define BVH_NODE_SIZE 4
 #define BVH_NODE_LEAF_SIZE 1
-#define BVH_QNODE_SIZE 7
+#define BVH_QNODE_SIZE 8
 #define BVH_QNODE_LEAF_SIZE 1
 #define TRI_NODE_SIZE 3
 
index 2a2d7822eee3107a52363691ae423a2c2654fdc6..118bceb4bec4594a80c85527a145f096cd3f6cbb 100644 (file)
@@ -69,7 +69,7 @@ ccl_device_inline int qbvh_node_intersect(KernelGlobals *__restrict kg,
                                           const int nodeAddr,
                                           ssef *__restrict dist)
 {
-       const int offset = nodeAddr*BVH_QNODE_SIZE;
+       const int offset = nodeAddr*BVH_QNODE_SIZE + 1;
 #ifdef __KERNEL_AVX2__
        const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, org_idir.x);
        const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, org_idir.y);
@@ -120,7 +120,7 @@ ccl_device_inline int qbvh_node_intersect_robust(KernelGlobals *__restrict kg,
                                                  const float difl,
                                                  ssef *__restrict dist)
 {
-       const int offset = nodeAddr*BVH_QNODE_SIZE;
+       const int offset = nodeAddr*BVH_QNODE_SIZE + 1;
 #ifdef __KERNEL_AVX2__
        const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, P_idir.x);
        const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, P_idir.y);
index edb5b5c78c35c61088efede87e564f477be1fd7e..01babba021507a9c6e3360662cc0bbc6877606fd 100644 (file)
@@ -97,6 +97,17 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
                do {
                        /* Traverse internal nodes. */
                        while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+                               float4 inodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+0);
+
+#ifdef __VISIBILITY_FLAG__
+                               if((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0) {
+                                       /* Pop. */
+                                       nodeAddr = traversalStack[stackPtr].addr;
+                                       --stackPtr;
+                                       continue;
+                               }
+#endif
+
                                ssef dist;
                                int traverseChild = qbvh_node_intersect(kg,
                                                                        tnear,
@@ -113,7 +124,8 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
                                                                        &dist);
 
                                if(traverseChild != 0) {
-                                       float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6);
+                                       float4 cnodes = kernel_tex_fetch(__bvh_nodes,
+                                                                        nodeAddr*BVH_QNODE_SIZE+7);
 
                                        /* One child is hit, continue with that child. */
                                        int r = __bscf(traverseChild);
index 84512a8783cf1948eb9c20c349ba06dda87a7d1e..bc717a0747ed321e0e3435cf85aabcc7cd19ac5c 100644 (file)
@@ -123,7 +123,8 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
                                                                        &dist);
 
                                if(traverseChild != 0) {
-                                       float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6);
+                                       float4 cnodes = kernel_tex_fetch(__bvh_nodes,
+                                                                        nodeAddr*BVH_QNODE_SIZE+7);
 
                                        /* One child is hit, continue with that child. */
                                        int r = __bscf(traverseChild);
index 738d08ac6fc9a17bc8795ca84aa1ebb036f09de7..098881a6d7e436a63e61c68227c8a03a6bd93654 100644 (file)
@@ -106,7 +106,14 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
                do {
                        /* Traverse internal nodes. */
                        while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
-                               if(UNLIKELY(nodeDist > isect->t)) {
+                               float4 inodes = kernel_tex_fetch(__bvh_nodes, 
+                                                                nodeAddr*BVH_QNODE_SIZE+0);
+
+                               if(UNLIKELY(nodeDist > isect->t)
+#ifdef __VISIBILITY_FLAG__
+                                  || (__float_as_uint(inodes.x) & visibility) == 0)
+#endif
+                               {
                                        /* Pop. */
                                        nodeAddr = traversalStack[stackPtr].addr;
                                        nodeDist = traversalStack[stackPtr].dist;
@@ -160,7 +167,8 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
                                }
 
                                if(traverseChild != 0) {
-                                       float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6);
+                                       float4 cnodes = kernel_tex_fetch(__bvh_nodes,
+                                                                        nodeAddr*BVH_QNODE_SIZE+7);
 
                                        /* One child is hit, continue with that child. */
                                        int r = __bscf(traverseChild);
@@ -261,7 +269,8 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
                                float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE);
 
 #ifdef __VISIBILITY_FLAG__
-                               if(UNLIKELY((nodeDist > isect->t) || ((__float_as_uint(leaf.z) & visibility) == 0)))
+                               if(UNLIKELY((nodeDist > isect->t) ||
+                                           ((__float_as_uint(leaf.z) & visibility) == 0)))
 #else
                                if(UNLIKELY((nodeDist > isect->t)))
 #endif
index ab2e530dd2082d5bde86061f332da869a54ca1a7..2aae90380c11341e83a9c0388a327147cf9681e0 100644 (file)
@@ -93,6 +93,16 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
                do {
                        /* Traverse internal nodes. */
                        while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+#ifdef __VISIBILITY_FLAG__
+                               float4 inodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+0);
+                               if((__float_as_uint(inodes.x) & visibility) == 0) {
+                                       /* Pop. */
+                                       nodeAddr = traversalStack[stackPtr].addr;
+                                       --stackPtr;
+                                       continue;
+                               }
+#endif
+
                                ssef dist;
                                int traverseChild = qbvh_node_intersect(kg,
                                                                        tnear,
@@ -109,7 +119,8 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
                                                                        &dist);
 
                                if(traverseChild != 0) {
-                                       float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6);
+                                       float4 cnodes = kernel_tex_fetch(__bvh_nodes,
+                                                                        nodeAddr*BVH_QNODE_SIZE+7);
 
                                        /* One child is hit, continue with that child. */
                                        int r = __bscf(traverseChild);
index 5546471b0e310d214e9986834c71d2c1c36ed614..e6b668eb75881e678abdcc33ffe79dc721984209 100644 (file)
@@ -97,6 +97,16 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
                do {
                        /* Traverse internal nodes. */
                        while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+#ifdef __VISIBILITY_FLAG__
+                               float4 inodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+0);
+                               if((__float_as_uint(inodes.x) & visibility) == 0) {
+                                       /* Pop. */
+                                       nodeAddr = traversalStack[stackPtr].addr;
+                                       --stackPtr;
+                                       continue;
+                               }
+#endif
+
                                ssef dist;
                                int traverseChild = qbvh_node_intersect(kg,
                                                                        tnear,
@@ -113,7 +123,8 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
                                                                        &dist);
 
                                if(traverseChild != 0) {
-                                       float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6);
+                                       float4 cnodes = kernel_tex_fetch(__bvh_nodes,
+                                                                        nodeAddr*BVH_QNODE_SIZE+7);
 
                                        /* One child is hit, continue with that child. */
                                        int r = __bscf(traverseChild);