Fix Cycles CUDA performance on CUDA 8.0.
[blender-staging.git] / intern / cycles / kernel / bvh / bvh_nodes.h
1 /*
2  * Copyright 2011-2016, Blender Foundation.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 // TODO(sergey): Look into avoid use of full Transform and use 3x3 matrix and
18 // 3-vector which might be faster.
19 ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
20                                                            int node_addr,
21                                                            int child)
22 {
23         Transform space;
24         const int child_addr = node_addr + child * 3;
25         space.x = kernel_tex_fetch(__bvh_nodes, child_addr+1);
26         space.y = kernel_tex_fetch(__bvh_nodes, child_addr+2);
27         space.z = kernel_tex_fetch(__bvh_nodes, child_addr+3);
28         space.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f);
29         return space;
30 }
31
32 #if !defined(__KERNEL_SSE2__)
33 ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
34                                                  const float3 P,
35                                                  const float3 idir,
36                                                  const float t,
37                                                  const int node_addr,
38                                                  const uint visibility,
39                                                  float dist[2])
40 {
41
42         /* fetch node data */
43         float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
44         float4 node0 = kernel_tex_fetch(__bvh_nodes, node_addr+1);
45         float4 node1 = kernel_tex_fetch(__bvh_nodes, node_addr+2);
46         float4 node2 = kernel_tex_fetch(__bvh_nodes, node_addr+3);
47
48         /* intersect ray against child nodes */
49         float c0lox = (node0.x - P.x) * idir.x;
50         float c0hix = (node0.z - P.x) * idir.x;
51         float c0loy = (node1.x - P.y) * idir.y;
52         float c0hiy = (node1.z - P.y) * idir.y;
53         float c0loz = (node2.x - P.z) * idir.z;
54         float c0hiz = (node2.z - P.z) * idir.z;
55         float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
56         float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
57
58         float c1lox = (node0.y - P.x) * idir.x;
59         float c1hix = (node0.w - P.x) * idir.x;
60         float c1loy = (node1.y - P.y) * idir.y;
61         float c1hiy = (node1.w - P.y) * idir.y;
62         float c1loz = (node2.y - P.z) * idir.z;
63         float c1hiz = (node2.w - P.z) * idir.z;
64         float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
65         float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
66
67         dist[0] = c0min;
68         dist[1] = c1min;
69
70 #ifdef __VISIBILITY_FLAG__
71         /* this visibility test gives a 5% performance hit, how to solve? */
72         return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
73                (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
74 #else
75         return ((c0max >= c0min)? 1: 0) |
76                ((c1max >= c1min)? 2: 0);
77 #endif
78 }
79
80 ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
81                                                         const float3 P,
82                                                         const float3 idir,
83                                                         const float t,
84                                                         const float difl,
85                                                         const float extmax,
86                                                         const int node_addr,
87                                                         const uint visibility,
88                                                         float dist[2])
89 {
90
91         /* fetch node data */
92         float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
93         float4 node0 = kernel_tex_fetch(__bvh_nodes, node_addr+1);
94         float4 node1 = kernel_tex_fetch(__bvh_nodes, node_addr+2);
95         float4 node2 = kernel_tex_fetch(__bvh_nodes, node_addr+3);
96
97         /* intersect ray against child nodes */
98         float c0lox = (node0.x - P.x) * idir.x;
99         float c0hix = (node0.z - P.x) * idir.x;
100         float c0loy = (node1.x - P.y) * idir.y;
101         float c0hiy = (node1.z - P.y) * idir.y;
102         float c0loz = (node2.x - P.z) * idir.z;
103         float c0hiz = (node2.z - P.z) * idir.z;
104         float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
105         float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
106
107         float c1lox = (node0.y - P.x) * idir.x;
108         float c1hix = (node0.w - P.x) * idir.x;
109         float c1loy = (node1.y - P.y) * idir.y;
110         float c1hiy = (node1.w - P.y) * idir.y;
111         float c1loz = (node2.y - P.z) * idir.z;
112         float c1hiz = (node2.w - P.z) * idir.z;
113         float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
114         float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
115
116         if(difl != 0.0f) {
117                 float hdiff = 1.0f + difl;
118                 float ldiff = 1.0f - difl;
119                 if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) {
120                         c0min = max(ldiff * c0min, c0min - extmax);
121                         c0max = min(hdiff * c0max, c0max + extmax);
122                 }
123                 if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) {
124                         c1min = max(ldiff * c1min, c1min - extmax);
125                         c1max = min(hdiff * c1max, c1max + extmax);
126                 }
127         }
128
129         dist[0] = c0min;
130         dist[1] = c1min;
131
132 #ifdef __VISIBILITY_FLAG__
133         /* this visibility test gives a 5% performance hit, how to solve? */
134         return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
135                (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
136 #else
137         return ((c0max >= c0min)? 1: 0) |
138                ((c1max >= c1min)? 2: 0);
139 #endif
140 }
141
142 ccl_device_forceinline bool bvh_unaligned_node_intersect_child(
143         KernelGlobals *kg,
144         const float3 P,
145         const float3 dir,
146         const float t,
147         int node_addr,
148         int child,
149         float dist[2])
150 {
151         Transform space  = bvh_unaligned_node_fetch_space(kg, node_addr, child);
152         float3 aligned_dir = transform_direction(&space, dir);
153         float3 aligned_P = transform_point(&space, P);
154         float3 nrdir = -bvh_inverse_direction(aligned_dir);
155         float3 lower_xyz = aligned_P * nrdir;
156         float3 upper_xyz = lower_xyz - nrdir;
157         const float near_x = min(lower_xyz.x, upper_xyz.x);
158         const float near_y = min(lower_xyz.y, upper_xyz.y);
159         const float near_z = min(lower_xyz.z, upper_xyz.z);
160         const float far_x  = max(lower_xyz.x, upper_xyz.x);
161         const float far_y  = max(lower_xyz.y, upper_xyz.y);
162         const float far_z  = max(lower_xyz.z, upper_xyz.z);
163         const float tnear   = max4(0.0f, near_x, near_y, near_z);
164         const float tfar    = min4(t, far_x, far_y, far_z);
165         *dist = tnear;
166         return tnear <= tfar;
167 }
168
169 ccl_device_forceinline bool bvh_unaligned_node_intersect_child_robust(
170         KernelGlobals *kg,
171         const float3 P,
172         const float3 dir,
173         const float t,
174         const float difl,
175         int node_addr,
176         int child,
177         float dist[2])
178 {
179         Transform space  = bvh_unaligned_node_fetch_space(kg, node_addr, child);
180         float3 aligned_dir = transform_direction(&space, dir);
181         float3 aligned_P = transform_point(&space, P);
182         float3 nrdir = -bvh_inverse_direction(aligned_dir);
183         float3 tLowerXYZ = aligned_P * nrdir;
184         float3 tUpperXYZ = tLowerXYZ - nrdir;
185         const float near_x = min(tLowerXYZ.x, tUpperXYZ.x);
186         const float near_y = min(tLowerXYZ.y, tUpperXYZ.y);
187         const float near_z = min(tLowerXYZ.z, tUpperXYZ.z);
188         const float far_x  = max(tLowerXYZ.x, tUpperXYZ.x);
189         const float far_y  = max(tLowerXYZ.y, tUpperXYZ.y);
190         const float far_z  = max(tLowerXYZ.z, tUpperXYZ.z);
191         const float tnear   = max4(0.0f, near_x, near_y, near_z);
192         const float tfar    = min4(t, far_x, far_y, far_z);
193         *dist = tnear;
194         if(difl != 0.0f) {
195                 /* TODO(sergey): Same as for QBVH, needs a proper use. */
196                 const float round_down = 1.0f - difl;
197                 const float round_up = 1.0f + difl;
198                 return round_down*tnear <= round_up*tfar;
199         }
200         else {
201                 return tnear <= tfar;
202         }
203 }
204
205 ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
206                                                    const float3 P,
207                                                    const float3 dir,
208                                                    const float3 idir,
209                                                    const float t,
210                                                    const int node_addr,
211                                                    const uint visibility,
212                                                    float dist[2])
213 {
214         int mask = 0;
215         float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
216         if(bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 0, &dist[0])) {
217 #ifdef __VISIBILITY_FLAG__
218                 if((__float_as_uint(cnodes.x) & visibility))
219 #endif
220                 {
221                         mask |= 1;
222                 }
223         }
224         if(bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 1, &dist[1])) {
225 #ifdef __VISIBILITY_FLAG__
226                 if((__float_as_uint(cnodes.y) & visibility))
227 #endif
228                 {
229                         mask |= 2;
230                 }
231         }
232         return mask;
233 }
234
235 ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
236                                                           const float3 P,
237                                                           const float3 dir,
238                                                           const float3 idir,
239                                                           const float t,
240                                                           const float difl,
241                                                           const float extmax,
242                                                           const int node_addr,
243                                                           const uint visibility,
244                                                           float dist[2])
245 {
246         int mask = 0;
247         float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
248         if(bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, node_addr, 0, &dist[0])) {
249 #ifdef __VISIBILITY_FLAG__
250                 if((__float_as_uint(cnodes.x) & visibility))
251 #endif
252                 {
253                         mask |= 1;
254                 }
255         }
256         if(bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, node_addr, 1, &dist[1])) {
257 #ifdef __VISIBILITY_FLAG__
258                 if((__float_as_uint(cnodes.y) & visibility))
259 #endif
260                 {
261                         mask |= 2;
262                 }
263         }
264         return mask;
265 }
266
267 ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
268                                          const float3 P,
269                                          const float3 dir,
270                                          const float3 idir,
271                                          const float t,
272                                          const int node_addr,
273                                          const uint visibility,
274                                          float dist[2])
275 {
276         float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
277         if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
278                 return bvh_unaligned_node_intersect(kg,
279                                                     P,
280                                                     dir,
281                                                     idir,
282                                                     t,
283                                                     node_addr,
284                                                     visibility,
285                                                     dist);
286         }
287         else {
288                 return bvh_aligned_node_intersect(kg,
289                                                   P,
290                                                   idir,
291                                                   t,
292                                                   node_addr,
293                                                   visibility,
294                                                   dist);
295         }
296 }
297
298 ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg,
299                                                 const float3 P,
300                                                 const float3 dir,
301                                                 const float3 idir,
302                                                 const float t,
303                                                 const float difl,
304                                                 const float extmax,
305                                                 const int node_addr,
306                                                 const uint visibility,
307                                                 float dist[2])
308 {
309         float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
310         if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
311                 return bvh_unaligned_node_intersect_robust(kg,
312                                                            P,
313                                                            dir,
314                                                            idir,
315                                                            t,
316                                                            difl,
317                                                            extmax,
318                                                            node_addr,
319                                                            visibility,
320                                                            dist);
321         }
322         else {
323                 return bvh_aligned_node_intersect_robust(kg,
324                                                          P,
325                                                          idir,
326                                                          t,
327                                                          difl,
328                                                          extmax,
329                                                          node_addr,
330                                                          visibility,
331                                                          dist);
332         }
333 }
334 #else  /* !defined(__KERNEL_SSE2__) */
335
336 int ccl_device_forceinline bvh_aligned_node_intersect(
337         KernelGlobals *kg,
338         const float3& P,
339         const float3& dir,
340         const ssef& tsplat,
341         const ssef Psplat[3],
342         const ssef idirsplat[3],
343         const shuffle_swap_t shufflexyz[3],
344         const int node_addr,
345         const uint visibility,
346         float dist[2])
347 {
348         /* Intersect two child bounding boxes, SSE3 version adapted from Embree */
349         const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
350
351         /* fetch node data */
352         const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + node_addr;
353
354         /* intersect ray against child nodes */
355         const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
356         const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
357         const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
358
359         /* calculate { c0min, c1min, -c0max, -c1max} */
360         ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
361         const ssef tminmax = minmax ^ pn;
362         const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
363
364         dist[0] = tminmax[0];
365         dist[1] = tminmax[1];
366
367         int mask = movemask(lrhit);
368
369 #  ifdef __VISIBILITY_FLAG__
370         /* this visibility test gives a 5% performance hit, how to solve? */
371         float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
372         int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
373                     (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
374         return cmask;
375 #  else
376         return mask & 3;
377 #  endif
378 }
379
380 ccl_device_forceinline int bvh_aligned_node_intersect_robust(
381         KernelGlobals *kg,
382         const float3& P,
383         const float3& dir,
384         const ssef& tsplat,
385         const ssef Psplat[3],
386         const ssef idirsplat[3],
387         const shuffle_swap_t shufflexyz[3],
388         const float difl,
389         const float extmax,
390         const int nodeAddr,
391         const uint visibility,
392         float dist[2])
393 {
394         /* Intersect two child bounding boxes, SSE3 version adapted from Embree */
395         const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
396
397         /* fetch node data */
398         const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr;
399
400         /* intersect ray against child nodes */
401         const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
402         const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
403         const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
404
405         /* calculate { c0min, c1min, -c0max, -c1max} */
406         ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
407         const ssef tminmax = minmax ^ pn;
408
409         if(difl != 0.0f) {
410                 float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
411                 float4 *tminmaxview = (float4*)&tminmax;
412                 float& c0min = tminmaxview->x, &c1min = tminmaxview->y;
413                 float& c0max = tminmaxview->z, &c1max = tminmaxview->w;
414                 float hdiff = 1.0f + difl;
415                 float ldiff = 1.0f - difl;
416                 if(__float_as_int(cnodes.x) & PATH_RAY_CURVE) {
417                         c0min = max(ldiff * c0min, c0min - extmax);
418                         c0max = min(hdiff * c0max, c0max + extmax);
419                 }
420                 if(__float_as_int(cnodes.y) & PATH_RAY_CURVE) {
421                         c1min = max(ldiff * c1min, c1min - extmax);
422                         c1max = min(hdiff * c1max, c1max + extmax);
423                 }
424         }
425
426         const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
427
428         dist[0] = tminmax[0];
429         dist[1] = tminmax[1];
430
431         int mask = movemask(lrhit);
432
433 #  ifdef __VISIBILITY_FLAG__
434         /* this visibility test gives a 5% performance hit, how to solve? */
435         float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0);
436         int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
437                     (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
438         return cmask;
439 #  else
440         return mask & 3;
441 #  endif
442 }
443
444 ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
445                                                    const float3 P,
446                                                    const float3 dir,
447                                                    const ssef& isect_near,
448                                                    const ssef& isect_far,
449                                                    const int node_addr,
450                                                    const uint visibility,
451                                                    float dist[2])
452 {
453         Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0);
454         Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1);
455
456         float3 aligned_dir0 = transform_direction(&space0, dir),
457                aligned_dir1 = transform_direction(&space1, dir);;
458         float3 aligned_P0 = transform_point(&space0, P),
459                aligned_P1 = transform_point(&space1, P);
460         float3 nrdir0 = -bvh_inverse_direction(aligned_dir0),
461                nrdir1 = -bvh_inverse_direction(aligned_dir1);
462
463         ssef lower_x = ssef(aligned_P0.x * nrdir0.x,
464                             aligned_P1.x * nrdir1.x,
465                             0.0f, 0.0f),
466              lower_y = ssef(aligned_P0.y * nrdir0.y,
467                             aligned_P1.y * nrdir1.y,
468                             0.0f,
469                             0.0f),
470              lower_z = ssef(aligned_P0.z * nrdir0.z,
471                             aligned_P1.z * nrdir1.z,
472                             0.0f,
473                             0.0f);
474
475         ssef upper_x = lower_x - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f),
476              upper_y = lower_y - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f),
477              upper_z = lower_z - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f);
478
479         ssef tnear_x = min(lower_x, upper_x);
480         ssef tnear_y = min(lower_y, upper_y);
481         ssef tnear_z = min(lower_z, upper_z);
482         ssef tfar_x = max(lower_x, upper_x);
483         ssef tfar_y = max(lower_y, upper_y);
484         ssef tfar_z = max(lower_z, upper_z);
485
486         const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
487         const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
488         sseb vmask = tnear <= tfar;
489         dist[0] = tnear.f[0];
490         dist[1] = tnear.f[1];
491
492         int mask = (int)movemask(vmask);
493
494 #  ifdef __VISIBILITY_FLAG__
495         /* this visibility test gives a 5% performance hit, how to solve? */
496         float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
497         int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
498                     (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
499         return cmask;
500 #  else
501         return mask & 3;
502 #  endif
503 }
504
505 ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
506                                                           const float3 P,
507                                                           const float3 dir,
508                                                           const ssef& isect_near,
509                                                           const ssef& isect_far,
510                                                           const float difl,
511                                                           const int node_addr,
512                                                           const uint visibility,
513                                                           float dist[2])
514 {
515         Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0);
516         Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1);
517
518         float3 aligned_dir0 = transform_direction(&space0, dir),
519                aligned_dir1 = transform_direction(&space1, dir);;
520         float3 aligned_P0 = transform_point(&space0, P),
521                aligned_P1 = transform_point(&space1, P);
522         float3 nrdir0 = -bvh_inverse_direction(aligned_dir0),
523                nrdir1 = -bvh_inverse_direction(aligned_dir1);
524
525         ssef lower_x = ssef(aligned_P0.x * nrdir0.x,
526                             aligned_P1.x * nrdir1.x,
527                             0.0f, 0.0f),
528              lower_y = ssef(aligned_P0.y * nrdir0.y,
529                             aligned_P1.y * nrdir1.y,
530                             0.0f,
531                             0.0f),
532              lower_z = ssef(aligned_P0.z * nrdir0.z,
533                             aligned_P1.z * nrdir1.z,
534                             0.0f,
535                             0.0f);
536
537         ssef upper_x = lower_x - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f),
538              upper_y = lower_y - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f),
539              upper_z = lower_z - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f);
540
541         ssef tnear_x = min(lower_x, upper_x);
542         ssef tnear_y = min(lower_y, upper_y);
543         ssef tnear_z = min(lower_z, upper_z);
544         ssef tfar_x = max(lower_x, upper_x);
545         ssef tfar_y = max(lower_y, upper_y);
546         ssef tfar_z = max(lower_z, upper_z);
547
548         const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
549         const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
550         sseb vmask;
551         if(difl != 0.0f) {
552                 const float round_down = 1.0f - difl;
553                 const float round_up = 1.0f + difl;
554                 vmask = round_down*tnear <= round_up*tfar;
555         }
556         else {
557                 vmask = tnear <= tfar;
558         }
559
560         dist[0] = tnear.f[0];
561         dist[1] = tnear.f[1];
562
563         int mask = (int)movemask(vmask);
564
565 #  ifdef __VISIBILITY_FLAG__
566         /* this visibility test gives a 5% performance hit, how to solve? */
567         float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
568         int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) |
569                     (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0);
570         return cmask;
571 #  else
572         return mask & 3;
573 #  endif
574 }
575
576 ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
577                                          const float3& P,
578                                          const float3& dir,
579                                          const ssef& isect_near,
580                                          const ssef& isect_far,
581                                          const ssef& tsplat,
582                                          const ssef Psplat[3],
583                                          const ssef idirsplat[3],
584                                          const shuffle_swap_t shufflexyz[3],
585                                          const int node_addr,
586                                          const uint visibility,
587                                          float dist[2])
588 {
589         float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
590         if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
591                 return bvh_unaligned_node_intersect(kg,
592                                                     P,
593                                                     dir,
594                                                     isect_near,
595                                                     isect_far,
596                                                     node_addr,
597                                                     visibility,
598                                                     dist);
599         }
600         else {
601                 return bvh_aligned_node_intersect(kg,
602                                                   P,
603                                                   dir,
604                                                   tsplat,
605                                                   Psplat,
606                                                   idirsplat,
607                                                   shufflexyz,
608                                                   node_addr,
609                                                   visibility,
610                                                   dist);
611         }
612 }
613
614 ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg,
615                                                 const float3& P,
616                                                 const float3& dir,
617                                                 const ssef& isect_near,
618                                                 const ssef& isect_far,
619                                                 const ssef& tsplat,
620                                                 const ssef Psplat[3],
621                                                 const ssef idirsplat[3],
622                                                 const shuffle_swap_t shufflexyz[3],
623                                                 const float difl,
624                                                 const float extmax,
625                                                 const int node_addr,
626                                                 const uint visibility,
627                                                 float dist[2])
628 {
629         float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
630         if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
631                 return bvh_unaligned_node_intersect_robust(kg,
632                                                            P,
633                                                            dir,
634                                                            isect_near,
635                                                            isect_far,
636                                                            difl,
637                                                            node_addr,
638                                                            visibility,
639                                                            dist);
640         }
641         else {
642                 return bvh_aligned_node_intersect_robust(kg,
643                                                          P,
644                                                          dir,
645                                                          tsplat,
646                                                          Psplat,
647                                                          idirsplat,
648                                                          shufflexyz,
649                                                          difl,
650                                                          extmax,
651                                                          node_addr,
652                                                          visibility,
653                                                          dist);
654         }
655 }
656 #endif  /* !defined(__KERNEL_SSE2__) */