Merging r46111 through r46136 from trunk into soc-2011-tomato
[blender-staging.git] / intern / cycles / kernel / kernel_bvh.h
1 /*
2  * Adapted from code Copyright 2009-2010 NVIDIA Corporation
3  * Modifications Copyright 2011, Blender Foundation.
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17
18 CCL_NAMESPACE_BEGIN
19
20 /*
21  * "Persistent while-while kernel" used in:
22  *
23  * "Understanding the Efficiency of Ray Traversal on GPUs",
24  * Timo Aila and Samuli Laine,
25  * Proc. High-Performance Graphics 2009
26  */
27
28 /* bottom-most stack entry, indicating the end of traversal */
29
30 #define ENTRYPOINT_SENTINEL 0x76543210
31 /* 64 object BVH + 64 mesh BVH + 64 object node splitting */
32 #define BVH_STACK_SIZE 192
33 #define BVH_NODE_SIZE 4
34 #define TRI_NODE_SIZE 3
35
36 /* silly workaround for float extended precision that happens when compiling
37    without sse support on x86, it results in different results for float ops
38    that you would otherwise expect to compare correctly */
39 #if !defined(__i386__) || defined(__SSE__)
40 #define NO_EXTENDED_PRECISION
41 #else
42 #define NO_EXTENDED_PRECISION volatile
43 #endif
44
45 __device_inline float3 bvh_inverse_direction(float3 dir)
46 {
47         /* avoid divide by zero (ooeps = exp2f(-80.0f)) */
48         float ooeps = 0.00000000000000000000000082718061255302767487140869206996285356581211090087890625f;
49         float3 idir;
50
51         idir.x = 1.0f/((fabsf(dir.x) > ooeps)? dir.x: copysignf(ooeps, dir.x));
52         idir.y = 1.0f/((fabsf(dir.y) > ooeps)? dir.y: copysignf(ooeps, dir.y));
53         idir.z = 1.0f/((fabsf(dir.z) > ooeps)? dir.z: copysignf(ooeps, dir.z));
54
55         return idir;
56 }
57
58 __device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, const float tmax)
59 {
60         Transform tfm = object_fetch_transform(kg, object, ray->time, OBJECT_INVERSE_TRANSFORM);
61
62         *P = transform_point(&tfm, ray->P);
63
64         float3 dir = transform_direction(&tfm, ray->D);
65
66         float len;
67         dir = normalize_len(dir, &len);
68
69         *idir = bvh_inverse_direction(dir);
70
71         if(*t != FLT_MAX)
72                 *t *= len;
73 }
74
75 __device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, const float tmax)
76 {
77         Transform tfm = object_fetch_transform(kg, object, ray->time, OBJECT_TRANSFORM);
78
79         if(*t != FLT_MAX)
80                 *t *= len(transform_direction(&tfm, 1.0f/(*idir)));
81
82         *P = ray->P;
83         *idir = bvh_inverse_direction(ray->D);
84 }
85
86 /* intersect two bounding boxes */
87 __device_inline void bvh_node_intersect(KernelGlobals *kg,
88         bool *traverseChild0, bool *traverseChild1,
89         bool *closestChild1, int *nodeAddr0, int *nodeAddr1,
90         float3 P, float3 idir, float t, uint visibility, int nodeAddr)
91 {
92         /* fetch node data */
93         float4 n0xy = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0);
94         float4 n1xy = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1);
95         float4 nz = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2);
96         float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3);
97
98         /* intersect ray against child nodes */
99         float3 ood = P * idir;
100         float c0lox = n0xy.x * idir.x - ood.x;
101         float c0hix = n0xy.y * idir.x - ood.x;
102         float c0loy = n0xy.z * idir.y - ood.y;
103         float c0hiy = n0xy.w * idir.y - ood.y;
104         float c0loz = nz.x * idir.z - ood.z;
105         float c0hiz = nz.y * idir.z - ood.z;
106         NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
107         NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
108
109         float c1loz = nz.z * idir.z - ood.z;
110         float c1hiz = nz.w * idir.z - ood.z;
111         float c1lox = n1xy.x * idir.x - ood.x;
112         float c1hix = n1xy.y * idir.x - ood.x;
113         float c1loy = n1xy.z * idir.y - ood.y;
114         float c1hiy = n1xy.w * idir.y - ood.y;
115         NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
116         NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
117
118         /* decide which nodes to traverse next */
119 #ifdef __VISIBILITY_FLAG__
120         /* this visibility test gives a 5% performance hit, how to solve? */
121         *traverseChild0 = (c0max >= c0min) && (__float_as_int(cnodes.z) & visibility);
122         *traverseChild1 = (c1max >= c1min) && (__float_as_int(cnodes.w) & visibility);
123 #else
124         *traverseChild0 = (c0max >= c0min);
125         *traverseChild1 = (c1max >= c1min);
126 #endif
127
128         *nodeAddr0 = __float_as_int(cnodes.x);
129         *nodeAddr1 = __float_as_int(cnodes.y);
130
131         *closestChild1 = (c1min < c0min);
132 }
133
134 /* Sven Woop's algorithm */
135 __device_inline void bvh_triangle_intersect(KernelGlobals *kg, Intersection *isect,
136         float3 P, float3 idir, uint visibility, int object, int triAddr)
137 {
138         /* compute and check intersection t-value */
139         float4 v00 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0);
140         float4 v11 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1);
141         float3 dir = 1.0f/idir;
142
143         float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
144         float invDz = 1.0f/(dir.x*v00.x + dir.y*v00.y + dir.z*v00.z);
145         float t = Oz * invDz;
146
147         if(t > 0.0f && t < isect->t) {
148                 /* compute and check barycentric u */
149                 float Ox = v11.w + P.x*v11.x + P.y*v11.y + P.z*v11.z;
150                 float Dx = dir.x*v11.x + dir.y*v11.y + dir.z*v11.z;
151                 float u = Ox + t*Dx;
152
153                 if(u >= 0.0f) {
154                         /* compute and check barycentric v */
155                         float4 v22 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2);
156                         float Oy = v22.w + P.x*v22.x + P.y*v22.y + P.z*v22.z;
157                         float Dy = dir.x*v22.x + dir.y*v22.y + dir.z*v22.z;
158                         float v = Oy + t*Dy;
159
160                         if(v >= 0.0f && u + v <= 1.0f) {
161 #ifdef __VISIBILITY_FLAG__
162                                 /* visibility flag test. we do it here under the assumption
163                                    that most triangles are culled by node flags */
164                                 if(kernel_tex_fetch(__prim_visibility, triAddr) & visibility)
165 #endif
166                                 {
167                                         /* record intersection */
168                                         isect->prim = triAddr;
169                                         isect->object = object;
170                                         isect->u = u;
171                                         isect->v = v;
172                                         isect->t = t;
173                                 }
174                         }
175                 }
176         }
177 }
178
179 __device_inline bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect)
180 {
181         /* traversal stack in CUDA thread-local memory */
182         int traversalStack[BVH_STACK_SIZE];
183         traversalStack[0] = ENTRYPOINT_SENTINEL;
184
185         /* traversal variables in registers */
186         int stackPtr = 0;
187         int nodeAddr = kernel_data.bvh.root;
188
189         /* ray parameters in registers */
190         const float tmax = ray->t;
191         float3 P = ray->P;
192         float3 idir = bvh_inverse_direction(ray->D);
193         int object = ~0;
194
195         isect->t = tmax;
196         isect->object = ~0;
197         isect->prim = ~0;
198         isect->u = 0.0f;
199         isect->v = 0.0f;
200
201         /* traversal loop */
202         do {
203                 do
204                 {
205                         /* traverse internal nodes */
206                         while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL)
207                         {
208                                 bool traverseChild0, traverseChild1, closestChild1;
209                                 int nodeAddrChild1;
210
211                                 bvh_node_intersect(kg, &traverseChild0, &traverseChild1,
212                                         &closestChild1, &nodeAddr, &nodeAddrChild1,
213                                         P, idir, isect->t, visibility, nodeAddr);
214
215                                 if(traverseChild0 != traverseChild1) {
216                                         /* one child was intersected */
217                                         if(traverseChild1) {
218                                                 nodeAddr = nodeAddrChild1;
219                                         }
220                                 }
221                                 else {
222                                         if(!traverseChild0) {
223                                                 /* neither child was intersected */
224                                                 nodeAddr = traversalStack[stackPtr];
225                                                 --stackPtr;
226                                         }
227                                         else {
228                                                 /* both children were intersected, push the farther one */
229                                                 if(closestChild1) {
230                                                         int tmp = nodeAddr;
231                                                         nodeAddr = nodeAddrChild1;
232                                                         nodeAddrChild1 = tmp;
233                                                 }
234
235                                                 ++stackPtr;
236                                                 traversalStack[stackPtr] = nodeAddrChild1;
237                                         }
238                                 }
239                         }
240
241                         /* if node is leaf, fetch triangle list */
242                         if(nodeAddr < 0) {
243                                 float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+(BVH_NODE_SIZE-1));
244                                 int primAddr = __float_as_int(leaf.x);
245
246 #ifdef __INSTANCING__
247                                 if(primAddr >= 0) {
248 #endif
249                                         int primAddr2 = __float_as_int(leaf.y);
250
251                                         /* pop */
252                                         nodeAddr = traversalStack[stackPtr];
253                                         --stackPtr;
254
255                                         /* triangle intersection */
256                                         while(primAddr < primAddr2) {
257                                                 /* intersect ray against triangle */
258                                                 bvh_triangle_intersect(kg, isect, P, idir, visibility, object, primAddr);
259
260                                                 /* shadow ray early termination */
261                                                 if(visibility == PATH_RAY_SHADOW_OPAQUE && isect->prim != ~0)
262                                                         return true;
263
264                                                 primAddr++;
265                                         }
266 #ifdef __INSTANCING__
267                                 }
268                                 else {
269                                         /* instance push */
270                                         object = kernel_tex_fetch(__prim_object, -primAddr-1);
271
272                                         bvh_instance_push(kg, object, ray, &P, &idir, &isect->t, tmax);
273
274                                         ++stackPtr;
275                                         traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
276
277                                         nodeAddr = kernel_tex_fetch(__object_node, object);
278                                 }
279 #endif
280                         }
281                 } while(nodeAddr != ENTRYPOINT_SENTINEL);
282
283 #ifdef __INSTANCING__
284                 if(stackPtr >= 0) {
285                         kernel_assert(object != ~0);
286
287                         /* instance pop */
288                         bvh_instance_pop(kg, object, ray, &P, &idir, &isect->t, tmax);
289                         object = ~0;
290                         nodeAddr = traversalStack[stackPtr];
291                         --stackPtr;
292                 }
293 #endif
294         } while(nodeAddr != ENTRYPOINT_SENTINEL);
295
296         return (isect->prim != ~0);
297 }
298
299 __device_inline float3 ray_offset(float3 P, float3 Ng)
300 {
301 #ifdef __INTERSECTION_REFINE__
302         const float epsilon_f = 1e-5f;
303         const int epsilon_i = 32;
304
305         float3 res;
306
307         /* x component */
308         if(fabsf(P.x) < epsilon_f) {
309                 res.x = P.x + Ng.x*epsilon_f;
310         }
311         else {
312                 uint ix = __float_as_uint(P.x);
313                 ix += ((ix ^ __float_as_uint(Ng.x)) >> 31)? -epsilon_i: epsilon_i;
314                 res.x = __uint_as_float(ix);
315         }
316
317         /* y component */
318         if(fabsf(P.y) < epsilon_f) {
319                 res.y = P.y + Ng.y*epsilon_f;
320         }
321         else {
322                 uint iy = __float_as_uint(P.y);
323                 iy += ((iy ^ __float_as_uint(Ng.y)) >> 31)? -epsilon_i: epsilon_i;
324                 res.y = __uint_as_float(iy);
325         }
326
327         /* z component */
328         if(fabsf(P.z) < epsilon_f) {
329                 res.z = P.z + Ng.z*epsilon_f;
330         }
331         else {
332                 uint iz = __float_as_uint(P.z);
333                 iz += ((iz ^ __float_as_uint(Ng.z)) >> 31)? -epsilon_i: epsilon_i;
334                 res.z = __uint_as_float(iz);
335         }
336
337         return res;
338 #else
339         const float epsilon_f = 1e-4f;
340         return P + epsilon_f*Ng;
341 #endif
342 }
343
344 __device_inline float3 bvh_triangle_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray)
345 {
346         float3 P = ray->P;
347         float3 D = ray->D;
348         float t = isect->t;
349
350 #ifdef __INTERSECTION_REFINE__
351         if(isect->object != ~0) {
352 #ifdef __MOTION__
353                 Transform tfm = sd->ob_itfm;
354 #else
355                 Transform tfm = object_fetch_transform(kg, isect->object, ray->time, OBJECT_INVERSE_TRANSFORM);
356 #endif
357
358                 P = transform_point(&tfm, P);
359                 D = transform_direction(&tfm, D*t);
360                 D = normalize_len(D, &t);
361         }
362
363         P = P + D*t;
364
365         float4 v00 = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0);
366         float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
367         float invDz = 1.0f/(D.x*v00.x + D.y*v00.y + D.z*v00.z);
368         float rt = Oz * invDz;
369
370         P = P + D*rt;
371
372         if(isect->object != ~0) {
373 #ifdef __MOTION__
374                 Transform tfm = sd->ob_tfm;
375 #else
376                 Transform tfm = object_fetch_transform(kg, isect->object, ray->time, OBJECT_TRANSFORM);
377 #endif
378
379                 P = transform_point(&tfm, P);
380         }
381
382         return P;
383 #else
384         return P + D*t;
385 #endif
386 }
387
388 CCL_NAMESPACE_END
389