/*
 * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/* This is a template BVH traversal function, where various features can be
 * enabled/disabled. This way we can compile optimized versions for each case
 * without new features slowing things down.
 *
 * BVH_INSTANCING: object instancing
 * BVH_HAIR: hair curve rendering
 * BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width
 * BVH_MOTION: motion blur rendering
 */
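/* For illustration only (a sketch; the real call sites live elsewhere in the
 * kernel sources): the template is expected to be included with the kernel
 * name and feature set defined first, along the lines of
 *
 *   #define BVH_FUNCTION_NAME bvh_intersect_motion
 *   #define BVH_FUNCTION_FEATURES BVH_MOTION
 *   #include "kernel/bvh/obvh_traversal.h"
 *
 * so that each compiled variant only pays for the features it enables.
 */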
#if BVH_FEATURE(BVH_HAIR)
# define NODE_INTERSECT obvh_node_intersect
# define NODE_INTERSECT_ROBUST obvh_node_intersect_robust
#else
# define NODE_INTERSECT obvh_aligned_node_intersect
# define NODE_INTERSECT_ROBUST obvh_aligned_node_intersect_robust
#endif
ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
  /* Traversal stack in thread-local memory. */
  OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
  traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
  traversal_stack[0].dist = -FLT_MAX;
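  /* The sentinel at the stack base terminates traversal: the loops below run
   * until it is popped back into node_addr. */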
  /* Traversal variables in registers. */
  int node_addr = kernel_data.bvh.root;
  float node_dist = -FLT_MAX;
  /* Ray parameters in registers. */
  float3 dir = bvh_clamp_direction(ray->D);
  float3 idir = bvh_inverse_direction(dir);
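  /* Clamping near-zero direction components keeps idir finite, so the slab
   * tests in the node intersectors never divide by zero. */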
  int object = OBJECT_NONE;
#if BVH_FEATURE(BVH_MOTION)
#ifndef __KERNEL_SSE41__
  isect->prim = PRIM_NONE;
  isect->object = OBJECT_NONE;
  avxf tnear(0.0f), tfar(ray->t);
#if BVH_FEATURE(BVH_HAIR)
  avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
#endif
  avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
#ifdef __KERNEL_AVX2__
  float3 P_idir = P*idir;
  avx3f P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
#endif
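  /* P*idir is precomputed once here; with AVX2/FMA the per-node slab test can
   * then fold the multiply and subtract into fused operations. */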
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
  avx3f org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
#endif
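  /* Each avxf holds 8 lanes, one per child of an 8-wide node; replicating the
   * ray into every lane lets a single test intersect all children at once. */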
  /* Offsets to select the side that becomes the lower or upper bound. */
  int near_x, near_y, near_z;
  int far_x, far_y, far_z;
  obvh_near_far_idx_calc(idir,
                         &near_x, &near_y, &near_z,
                         &far_x, &far_y, &far_z);
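  /* A sketch of the assumed convention (the helper is defined elsewhere):
   * per axis, pick which packed bound acts as the slab entry and which as
   * the exit, e.g. for X:
   *
   *   if(idir.x >= 0.0f) { near_x = 0; far_x = 1; }
   *   else               { near_x = 1; far_x = 0; }
   *
   * mirrored for Y and Z, so the node test needs no per-axis sign branches. */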
  /* Traversal loop. */
  do {
    do {
      /* Traverse internal nodes. */
      while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
        float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
        if(UNLIKELY(node_dist > isect->t)
#if BVH_FEATURE(BVH_MOTION)
           || UNLIKELY(ray->time < inodes.y)
           || UNLIKELY(ray->time > inodes.z)
#endif
#ifdef __VISIBILITY_FLAG__
           || (__float_as_uint(inodes.x) & visibility) == 0
#endif
           )
        {
          /* Pop. */
          node_addr = traversal_stack[stack_ptr].addr;
          node_dist = traversal_stack[stack_ptr].dist;
          --stack_ptr;
          continue;
        }
        BVH_DEBUG_NEXT_NODE();
#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
        /* NOTE: We extend all the child bounding boxes instead of fetching
         * and checking visibility flags for each of them.
         *
         * Need to test if doing the opposite would be any faster.
         */
        child_mask = NODE_INTERSECT_ROBUST(kg,
# ifdef __KERNEL_AVX2__
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
# if BVH_FEATURE(BVH_HAIR)
                                           near_x, near_y, near_z,
#endif /* BVH_HAIR_MINIMUM_WIDTH */
        child_mask = NODE_INTERSECT(kg,
#ifdef __KERNEL_AVX2__
#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
#if BVH_FEATURE(BVH_HAIR)
                                    near_x, near_y, near_z,
        if(child_mask != 0) {
          /* TODO(sergey): Investigate whether moving cnodes upwards
           * gives a speedup (will be different cache pattern but will
           * avoid extra check here).
           */
#if BVH_FEATURE(BVH_HAIR)
          if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
            cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+26);
          }
          else
#endif
          {
            cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr+14);
          }
          /* One child is hit, continue with that child. */
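          /* __bscf finds the lowest set bit in the mask, clears it, and
           * returns its index, i.e. the lane of the next hit child. */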
          int r = __bscf(child_mask);
          float d0 = ((float*)&dist)[r];
          if(child_mask == 0) {
            node_addr = __float_as_int(cnodes[r]);
            node_dist = d0;
            continue;
          }
          /* Two children are hit, push far child, and continue with
           * closer child.
           */
          int c0 = __float_as_int(cnodes[r]);
          r = __bscf(child_mask);
          int c1 = __float_as_int(cnodes[r]);
          float d1 = ((float*)&dist)[r];
          if(child_mask == 0) {
            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
            traversal_stack[stack_ptr].addr = c0;
            traversal_stack[stack_ptr].dist = d0;
            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
            traversal_stack[stack_ptr].addr = c1;
            traversal_stack[stack_ptr].dist = d1;
          /* Here starts the slow path for 3 or more hit children. We push
           * all nodes onto the stack to sort them there.
           */
          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
          traversal_stack[stack_ptr].addr = c1;
          traversal_stack[stack_ptr].dist = d1;
          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
          traversal_stack[stack_ptr].addr = c0;
          traversal_stack[stack_ptr].dist = d0;
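          /* From here on every remaining hit lane is pushed, then the top
           * stack entries are sorted by distance so that popping always
           * continues traversal with the nearest child first. */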
          /* Three children are hit, push all onto stack and sort 3
           * stack items, continue with closest child.
           */
          r = __bscf(child_mask);
          int c2 = __float_as_int(cnodes[r]);
          float d2 = ((float*)&dist)[r];
          if(child_mask == 0) {
            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
            traversal_stack[stack_ptr].addr = c2;
            traversal_stack[stack_ptr].dist = d2;
            obvh_stack_sort(&traversal_stack[stack_ptr],
                            &traversal_stack[stack_ptr - 1],
                            &traversal_stack[stack_ptr - 2]);
            node_addr = traversal_stack[stack_ptr].addr;
            node_dist = traversal_stack[stack_ptr].dist;
          /* Four children are hit, push all onto stack and sort 4
           * stack items, continue with closest child.
           */
          r = __bscf(child_mask);
          int c3 = __float_as_int(cnodes[r]);
          float d3 = ((float*)&dist)[r];
          if(child_mask == 0) {
            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
            traversal_stack[stack_ptr].addr = c3;
            traversal_stack[stack_ptr].dist = d3;
            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
            traversal_stack[stack_ptr].addr = c2;
            traversal_stack[stack_ptr].dist = d2;
            obvh_stack_sort(&traversal_stack[stack_ptr],
                            &traversal_stack[stack_ptr - 1],
                            &traversal_stack[stack_ptr - 2],
                            &traversal_stack[stack_ptr - 3]);
            node_addr = traversal_stack[stack_ptr].addr;
            node_dist = traversal_stack[stack_ptr].dist;
          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
          traversal_stack[stack_ptr].addr = c3;
          traversal_stack[stack_ptr].dist = d3;
          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
          traversal_stack[stack_ptr].addr = c2;
          traversal_stack[stack_ptr].dist = d2;
          /* Five children are hit, push all onto stack and sort 5
           * stack items, continue with closest child.
           */
          r = __bscf(child_mask);
          int c4 = __float_as_int(cnodes[r]);
          float d4 = ((float*)&dist)[r];
          if(child_mask == 0) {
            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
            traversal_stack[stack_ptr].addr = c4;
            traversal_stack[stack_ptr].dist = d4;
            obvh_stack_sort(&traversal_stack[stack_ptr],
                            &traversal_stack[stack_ptr - 1],
                            &traversal_stack[stack_ptr - 2],
                            &traversal_stack[stack_ptr - 3],
                            &traversal_stack[stack_ptr - 4]);
            node_addr = traversal_stack[stack_ptr].addr;
            node_dist = traversal_stack[stack_ptr].dist;
          /* Six children are hit, push all onto stack and sort 6
           * stack items, continue with closest child.
           */
          r = __bscf(child_mask);
          int c5 = __float_as_int(cnodes[r]);
          float d5 = ((float*)&dist)[r];
          if(child_mask == 0) {
            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
            traversal_stack[stack_ptr].addr = c5;
            traversal_stack[stack_ptr].dist = d5;
            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
            traversal_stack[stack_ptr].addr = c4;
            traversal_stack[stack_ptr].dist = d4;
            obvh_stack_sort(&traversal_stack[stack_ptr],
                            &traversal_stack[stack_ptr - 1],
                            &traversal_stack[stack_ptr - 2],
                            &traversal_stack[stack_ptr - 3],
                            &traversal_stack[stack_ptr - 4],
                            &traversal_stack[stack_ptr - 5]);
            node_addr = traversal_stack[stack_ptr].addr;
            node_dist = traversal_stack[stack_ptr].dist;
          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
          traversal_stack[stack_ptr].addr = c5;
          traversal_stack[stack_ptr].dist = d5;
          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
          traversal_stack[stack_ptr].addr = c4;
          traversal_stack[stack_ptr].dist = d4;
          /* Seven children are hit, push all onto stack and sort 7
           * stack items, continue with closest child.
           */
          r = __bscf(child_mask);
          int c6 = __float_as_int(cnodes[r]);
          float d6 = ((float*)&dist)[r];
          if(child_mask == 0) {
            kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
            traversal_stack[stack_ptr].addr = c6;
            traversal_stack[stack_ptr].dist = d6;
            obvh_stack_sort(&traversal_stack[stack_ptr],
                            &traversal_stack[stack_ptr - 1],
                            &traversal_stack[stack_ptr - 2],
                            &traversal_stack[stack_ptr - 3],
                            &traversal_stack[stack_ptr - 4],
                            &traversal_stack[stack_ptr - 5],
                            &traversal_stack[stack_ptr - 6]);
            node_addr = traversal_stack[stack_ptr].addr;
            node_dist = traversal_stack[stack_ptr].dist;
          /* Eight children are hit, push all onto stack and sort 8
           * stack items, continue with closest child.
           */
          r = __bscf(child_mask);
          int c7 = __float_as_int(cnodes[r]);
          float d7 = ((float*)&dist)[r];
          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
          traversal_stack[stack_ptr].addr = c7;
          traversal_stack[stack_ptr].dist = d7;
          kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
          traversal_stack[stack_ptr].addr = c6;
          traversal_stack[stack_ptr].dist = d6;
          obvh_stack_sort(&traversal_stack[stack_ptr],
                          &traversal_stack[stack_ptr - 1],
                          &traversal_stack[stack_ptr - 2],
                          &traversal_stack[stack_ptr - 3],
                          &traversal_stack[stack_ptr - 4],
                          &traversal_stack[stack_ptr - 5],
                          &traversal_stack[stack_ptr - 6],
                          &traversal_stack[stack_ptr - 7]);
          node_addr = traversal_stack[stack_ptr].addr;
          node_dist = traversal_stack[stack_ptr].dist;
        node_addr = traversal_stack[stack_ptr].addr;
        node_dist = traversal_stack[stack_ptr].dist;
      /* If node is leaf, fetch triangle list. */
      if(node_addr < 0) {
        float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
#ifdef __VISIBILITY_FLAG__
        if(UNLIKELY((node_dist > isect->t) ||
                    ((__float_as_uint(leaf.z) & visibility) == 0)))
#else
        if(UNLIKELY((node_dist > isect->t)))
#endif
        {
          /* Pop. */
          node_addr = traversal_stack[stack_ptr].addr;
          node_dist = traversal_stack[stack_ptr].dist;
          --stack_ptr;
          continue;
        }
        int prim_addr = __float_as_int(leaf.x);
#if BVH_FEATURE(BVH_INSTANCING)
        int prim_addr2 = __float_as_int(leaf.y);
        const uint type = __float_as_int(leaf.w);
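        /* Leaf layout, as read above: leaf.x and leaf.y bound the primitive
         * index range, leaf.z carries visibility flags and leaf.w the type. */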
        node_addr = traversal_stack[stack_ptr].addr;
        node_dist = traversal_stack[stack_ptr].dist;
        /* Primitive intersection. */
        switch(type & PRIMITIVE_ALL) {
          case PRIMITIVE_TRIANGLE: {
            int prim_count = prim_addr2 - prim_addr;
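            /* prim_count chooses between the scalar loop below and the
             * 8-wide triangle_intersect8 batch path further down. */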
            for(; prim_addr < prim_addr2; prim_addr++) {
              BVH_DEBUG_NEXT_INTERSECTION();
              kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
              if(triangle_intersect(kg,
                tfar = avxf(isect->t);
                /* Shadow ray early termination. */
                if(visibility == PATH_RAY_SHADOW_OPAQUE) {
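                  /* Any hit terminates an opaque shadow ray; for other rays
                   * the tfar refresh above lets later node tests cull
                   * children beyond the new closest hit. */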
              kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
              if(triangle_intersect8(kg,
                tfar = avxf(isect->t);
                if(visibility == PATH_RAY_SHADOW_OPAQUE) {
#if BVH_FEATURE(BVH_MOTION)
          case PRIMITIVE_MOTION_TRIANGLE: {
            for(; prim_addr < prim_addr2; prim_addr++) {
              BVH_DEBUG_NEXT_INTERSECTION();
              kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
              if(motion_triangle_intersect(kg,
                tfar = avxf(isect->t);
                /* Shadow ray early termination. */
                if(visibility == PATH_RAY_SHADOW_OPAQUE) {
#endif /* BVH_FEATURE(BVH_MOTION) */
#if BVH_FEATURE(BVH_HAIR)
          case PRIMITIVE_CURVE:
          case PRIMITIVE_MOTION_CURVE: {
            for(; prim_addr < prim_addr2; prim_addr++) {
              BVH_DEBUG_NEXT_INTERSECTION();
              const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
              kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL));
              bool hit;
              if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
                hit = cardinal_curve_intersect(kg,
              }
              else {
                hit = curve_intersect(kg,
                tfar = avxf(isect->t);
                /* Shadow ray early termination. */
                if(visibility == PATH_RAY_SHADOW_OPAQUE) {
#endif /* BVH_FEATURE(BVH_HAIR) */
#if BVH_FEATURE(BVH_INSTANCING)
        object = kernel_tex_fetch(__prim_object, -prim_addr-1);
# if BVH_FEATURE(BVH_MOTION)
        qbvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist, &ob_itfm);
# else
        qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist);
# endif
        obvh_near_far_idx_calc(idir,
                               &near_x, &near_y, &near_z,
                               &far_x, &far_y, &far_z);
        tfar = avxf(isect->t);
# if BVH_FEATURE(BVH_HAIR)
        dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
# endif
        idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
# ifdef __KERNEL_AVX2__
        P_idir = P*idir;
        P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
# endif
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
        org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
# endif
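        /* The ray was just transformed into the instance's object space, so
         * every SIMD value derived from it above is rebuilt to match. */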
        kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
        traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
        traversal_stack[stack_ptr].dist = -FLT_MAX;

        node_addr = kernel_tex_fetch(__object_node, object);

        BVH_DEBUG_NEXT_INSTANCE();
#endif /* FEATURE(BVH_INSTANCING) */
    } while(node_addr != ENTRYPOINT_SENTINEL);
#if BVH_FEATURE(BVH_INSTANCING)
      kernel_assert(object != OBJECT_NONE);
# if BVH_FEATURE(BVH_MOTION)
      isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
# else
      isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
# endif
      obvh_near_far_idx_calc(idir,
                             &near_x, &near_y, &near_z,
                             &far_x, &far_y, &far_z);
      tfar = avxf(isect->t);
# if BVH_FEATURE(BVH_HAIR)
      dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
# endif
      idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
# ifdef __KERNEL_AVX2__
      P_idir = P*idir;
      P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
# endif
# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
      org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
# endif
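      /* Back in world space: the SIMD ray state is rebuilt the same way it
       * was set up when the instance was entered. */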
      object = OBJECT_NONE;
      node_addr = traversal_stack[stack_ptr].addr;
      node_dist = traversal_stack[stack_ptr].dist;
#endif /* FEATURE(BVH_INSTANCING) */
  } while(node_addr != ENTRYPOINT_SENTINEL);
  return (isect->prim != PRIM_NONE);
}
#undef NODE_INTERSECT
#undef NODE_INTERSECT_ROBUST