2 * Adapted from code Copyright 2009-2010 NVIDIA Corporation
3 * Modifications Copyright 2011, Blender Foundation.
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
21 * "Persistent while-while kernel" used in:
23 * "Understanding the Efficiency of Ray Traversal on GPUs",
24 * Timo Aila and Samuli Laine,
25 * Proc. High-Performance Graphics 2009
28 /* bottom-most stack entry, indicating the end of traversal */
30 #define ENTRYPOINT_SENTINEL 0x76543210
31 /* 64 object BVH + 64 mesh BVH + 64 object node splitting */
32 #define BVH_STACK_SIZE 192
33 #define BVH_NODE_SIZE 4
34 #define TRI_NODE_SIZE 3
36 /* silly workaround for float extended precision that happens when compiling
37 without sse support on x86, it results in different results for float ops
38 that you would otherwise expect to compare correctly */
39 #if !defined(__i386__) || defined(__SSE__)
40 #define NO_EXTENDED_PRECISION
42 #define NO_EXTENDED_PRECISION volatile
45 __device_inline float3 bvh_inverse_direction(float3 dir)
47 /* avoid divide by zero (ooeps = exp2f(-80.0f)) */
48 float ooeps = 0.00000000000000000000000082718061255302767487140869206996285356581211090087890625f;
51 idir.x = 1.0f/((fabsf(dir.x) > ooeps)? dir.x: copysignf(ooeps, dir.x));
52 idir.y = 1.0f/((fabsf(dir.y) > ooeps)? dir.y: copysignf(ooeps, dir.y));
53 idir.z = 1.0f/((fabsf(dir.z) > ooeps)? dir.z: copysignf(ooeps, dir.z));
58 __device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, const float tmax)
60 Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
62 *P = transform(&tfm, ray->P);
64 float3 dir = transform_direction(&tfm, ray->D);
67 dir = normalize_len(dir, &len);
69 *idir = bvh_inverse_direction(dir);
75 __device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, const float tmax)
77 Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
80 *t *= len(transform_direction(&tfm, 1.0f/(*idir)));
83 *idir = bvh_inverse_direction(ray->D);
86 /* intersect two bounding boxes */
87 __device_inline void bvh_node_intersect(KernelGlobals *kg,
88 bool *traverseChild0, bool *traverseChild1,
89 bool *closestChild1, int *nodeAddr0, int *nodeAddr1,
90 float3 P, float3 idir, float t, uint visibility, int nodeAddr)
93 float4 n0xy = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0);
94 float4 n1xy = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1);
95 float4 nz = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2);
96 float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3);
98 /* intersect ray against child nodes */
99 float3 ood = P * idir;
100 float c0lox = n0xy.x * idir.x - ood.x;
101 float c0hix = n0xy.y * idir.x - ood.x;
102 float c0loy = n0xy.z * idir.y - ood.y;
103 float c0hiy = n0xy.w * idir.y - ood.y;
104 float c0loz = nz.x * idir.z - ood.z;
105 float c0hiz = nz.y * idir.z - ood.z;
106 NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
107 NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
109 float c1loz = nz.z * idir.z - ood.z;
110 float c1hiz = nz.w * idir.z - ood.z;
111 float c1lox = n1xy.x * idir.x - ood.x;
112 float c1hix = n1xy.y * idir.x - ood.x;
113 float c1loy = n1xy.z * idir.y - ood.y;
114 float c1hiy = n1xy.w * idir.y - ood.y;
115 NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
116 NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
118 /* decide which nodes to traverse next */
119 #ifdef __VISIBILITY_FLAG__
120 /* this visibility test gives a 5% performance hit, how to solve? */
121 *traverseChild0 = (c0max >= c0min) && (__float_as_int(cnodes.z) & visibility);
122 *traverseChild1 = (c1max >= c1min) && (__float_as_int(cnodes.w) & visibility);
124 *traverseChild0 = (c0max >= c0min);
125 *traverseChild1 = (c1max >= c1min);
128 *nodeAddr0 = __float_as_int(cnodes.x);
129 *nodeAddr1 = __float_as_int(cnodes.y);
131 *closestChild1 = (c1min < c0min);
134 /* Sven Woop's algorithm */
135 __device_inline void bvh_triangle_intersect(KernelGlobals *kg, Intersection *isect,
136 float3 P, float3 idir, uint visibility, int object, int triAddr)
138 /* compute and check intersection t-value */
139 float4 v00 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0);
140 float4 v11 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1);
141 float3 dir = 1.0f/idir;
143 float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
144 float invDz = 1.0f/(dir.x*v00.x + dir.y*v00.y + dir.z*v00.z);
145 float t = Oz * invDz;
147 if(t > 0.0f && t < isect->t) {
148 /* compute and check barycentric u */
149 float Ox = v11.w + P.x*v11.x + P.y*v11.y + P.z*v11.z;
150 float Dx = dir.x*v11.x + dir.y*v11.y + dir.z*v11.z;
154 /* compute and check barycentric v */
155 float4 v22 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2);
156 float Oy = v22.w + P.x*v22.x + P.y*v22.y + P.z*v22.z;
157 float Dy = dir.x*v22.x + dir.y*v22.y + dir.z*v22.z;
160 if(v >= 0.0f && u + v <= 1.0f) {
161 #ifdef __VISIBILITY_FLAG__
162 /* visibility flag test. we do it here under the assumption
163 that most triangles are culled by node flags */
164 if(kernel_tex_fetch(__prim_visibility, triAddr) & visibility)
167 /* record intersection */
168 isect->prim = triAddr;
169 isect->object = object;
179 __device_inline bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect)
181 /* traversal stack in CUDA thread-local memory */
182 int traversalStack[BVH_STACK_SIZE];
183 traversalStack[0] = ENTRYPOINT_SENTINEL;
185 /* traversal variables in registers */
187 int nodeAddr = kernel_data.bvh.root;
189 /* ray parameters in registers */
190 const float tmax = ray->t;
192 float3 idir = bvh_inverse_direction(ray->D);
205 /* traverse internal nodes */
206 while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL)
208 bool traverseChild0, traverseChild1, closestChild1;
211 bvh_node_intersect(kg, &traverseChild0, &traverseChild1,
212 &closestChild1, &nodeAddr, &nodeAddrChild1,
213 P, idir, isect->t, visibility, nodeAddr);
215 if(traverseChild0 != traverseChild1) {
216 /* one child was intersected */
218 nodeAddr = nodeAddrChild1;
222 if(!traverseChild0) {
223 /* neither child was intersected */
224 nodeAddr = traversalStack[stackPtr];
228 /* both children were intersected, push the farther one */
231 nodeAddr = nodeAddrChild1;
232 nodeAddrChild1 = tmp;
236 traversalStack[stackPtr] = nodeAddrChild1;
241 /* if node is leaf, fetch triangle list */
243 float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+(BVH_NODE_SIZE-1));
244 int primAddr = __float_as_int(leaf.x);
246 #ifdef __INSTANCING__
249 int primAddr2 = __float_as_int(leaf.y);
252 nodeAddr = traversalStack[stackPtr];
255 /* triangle intersection */
256 while(primAddr < primAddr2) {
257 /* intersect ray against triangle */
258 bvh_triangle_intersect(kg, isect, P, idir, visibility, object, primAddr);
260 /* shadow ray early termination */
261 if(visibility == PATH_RAY_SHADOW_OPAQUE && isect->prim != ~0)
266 #ifdef __INSTANCING__
270 object = kernel_tex_fetch(__prim_object, -primAddr-1);
272 bvh_instance_push(kg, object, ray, &P, &idir, &isect->t, tmax);
275 traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
277 nodeAddr = kernel_tex_fetch(__object_node, object);
281 } while(nodeAddr != ENTRYPOINT_SENTINEL);
283 #ifdef __INSTANCING__
285 kernel_assert(object != ~0);
288 bvh_instance_pop(kg, object, ray, &P, &idir, &isect->t, tmax);
290 nodeAddr = traversalStack[stackPtr];
294 } while(nodeAddr != ENTRYPOINT_SENTINEL);
296 return (isect->prim != ~0);
299 __device_inline float3 ray_offset(float3 P, float3 Ng)
301 #ifdef __INTERSECTION_REFINE__
302 const float epsilon_f = 1e-5f;
303 const int epsilon_i = 32;
308 if(fabsf(P.x) < epsilon_f) {
309 res.x = P.x + Ng.x*epsilon_f;
312 uint ix = __float_as_uint(P.x);
313 ix += ((ix ^ __float_as_uint(Ng.x)) >> 31)? -epsilon_i: epsilon_i;
314 res.x = __uint_as_float(ix);
318 if(fabsf(P.y) < epsilon_f) {
319 res.y = P.y + Ng.y*epsilon_f;
322 uint iy = __float_as_uint(P.y);
323 iy += ((iy ^ __float_as_uint(Ng.y)) >> 31)? -epsilon_i: epsilon_i;
324 res.y = __uint_as_float(iy);
328 if(fabsf(P.z) < epsilon_f) {
329 res.z = P.z + Ng.z*epsilon_f;
332 uint iz = __float_as_uint(P.z);
333 iz += ((iz ^ __float_as_uint(Ng.z)) >> 31)? -epsilon_i: epsilon_i;
334 res.z = __uint_as_float(iz);
339 const float epsilon_f = 1e-4f;
340 return P + epsilon_f*Ng;
344 __device_inline float3 bvh_triangle_refine(KernelGlobals *kg, const Intersection *isect, const Ray *ray)
350 #ifdef __INTERSECTION_REFINE__
351 if(isect->object != ~0) {
352 Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
354 P = transform(&tfm, P);
355 D = transform_direction(&tfm, D*t);
356 D = normalize_len(D, &t);
361 float4 v00 = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0);
362 float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
363 float invDz = 1.0f/(D.x*v00.x + D.y*v00.y + D.z*v00.z);
364 float rt = Oz * invDz;
368 if(isect->object != ~0) {
369 Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
370 P = transform(&tfm, P);