Cycles: Avoid having ShaderData on the stack
author Sergey Sharybin <sergey.vfx@gmail.com>
Sun, 22 Nov 2015 10:00:29 +0000 (15:00 +0500)
committer Sergey Sharybin <sergey.vfx@gmail.com>
Wed, 25 Nov 2015 08:01:22 +0000 (13:01 +0500)
This commit introduces an SSS-oriented intersection structure which replaces the
old logic of having separate arrays for intersections and shader data, and
encapsulates all the data needed for SSS evaluation.

This gives a huge stack memory saving on GPU. In my own experiments it gave a 25%
memory usage reduction on a GTX560Ti (722MB vs. 946MB).

Unfortunately, this also causes a performance loss of 20%, which only happens on
GPU. This is perhaps due to a different memory access pattern. Hopefully this
will be solved in the future.

Famous saying: won in memory - lost in time (which is also valid the other way
around).

intern/cycles/kernel/geom/geom_bvh.h
intern/cycles/kernel/geom/geom_bvh_subsurface.h
intern/cycles/kernel/geom/geom_motion_triangle.h
intern/cycles/kernel/geom/geom_qbvh_subsurface.h
intern/cycles/kernel/geom/geom_triangle_intersect.h
intern/cycles/kernel/kernel_path.h
intern/cycles/kernel/kernel_path_branched.h
intern/cycles/kernel/kernel_subsurface.h
intern/cycles/kernel/kernel_types.h

index 3d0d406dd0bf7ab12a020fcd1c5751d346e69010..cea505002e28db9013a28bdeb927fd8fab94c946 100644 (file)
@@ -255,38 +255,81 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg, const Ray *ray, con
 }
 
 #ifdef __SUBSURFACE__
-ccl_device_intersect uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection *isect, int subsurface_object, uint *lcg_state, int max_hits)
+ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg,
+                                                     const Ray *ray,
+                                                     SubsurfaceIntersection *ss_isect,
+                                                     int subsurface_object,
+                                                     uint *lcg_state,
+                                                     int max_hits)
 {
 #ifdef __OBJECT_MOTION__
        if(kernel_data.bvh.have_motion) {
 #ifdef __HAIR__
-               if(kernel_data.bvh.have_curves)
-                       return bvh_intersect_subsurface_hair_motion(kg, ray, isect, subsurface_object, lcg_state, max_hits);
+               if(kernel_data.bvh.have_curves) {
+                       return bvh_intersect_subsurface_hair_motion(kg,
+                                                                   ray,
+                                                                   ss_isect,
+                                                                   subsurface_object,
+                                                                   lcg_state,
+                                                                   max_hits);
+               }
 #endif /* __HAIR__ */
 
-               return bvh_intersect_subsurface_motion(kg, ray, isect, subsurface_object, lcg_state, max_hits);
+               return bvh_intersect_subsurface_motion(kg,
+                                                      ray,
+                                                      ss_isect,
+                                                      subsurface_object,
+                                                      lcg_state,
+                                                      max_hits);
        }
 #endif /* __OBJECT_MOTION__ */
 
-#ifdef __HAIR__ 
-       if(kernel_data.bvh.have_curves)
-               return bvh_intersect_subsurface_hair(kg, ray, isect, subsurface_object, lcg_state, max_hits);
+#ifdef __HAIR__
+       if(kernel_data.bvh.have_curves) {
+               return bvh_intersect_subsurface_hair(kg,
+                                                    ray,
+                                                    ss_isect,
+                                                    subsurface_object,
+                                                    lcg_state,
+                                                    max_hits);
+       }
 #endif /* __HAIR__ */
 
 #ifdef __KERNEL_CPU__
 
 #ifdef __INSTANCING__
-       if(kernel_data.bvh.have_instancing)
-               return bvh_intersect_subsurface_instancing(kg, ray, isect, subsurface_object, lcg_state, max_hits);
+       if(kernel_data.bvh.have_instancing) {
+               return bvh_intersect_subsurface_instancing(kg,
+                                                          ray,
+                                                          ss_isect,
+                                                          subsurface_object,
+                                                          lcg_state,
+                                                          max_hits);
+       }
 #endif /* __INSTANCING__ */
 
-       return bvh_intersect_subsurface(kg, ray, isect, subsurface_object, lcg_state, max_hits);
+       return bvh_intersect_subsurface(kg,
+                                       ray,
+                                       ss_isect,
+                                       subsurface_object,
+                                       lcg_state,
+                                       max_hits);
 #else /* __KERNEL_CPU__ */
 
 #ifdef __INSTANCING__
-       return bvh_intersect_subsurface_instancing(kg, ray, isect, subsurface_object, lcg_state, max_hits);
+       return bvh_intersect_subsurface_instancing(kg,
+                                                  ray,
+                                                  ss_isect,
+                                                  subsurface_object,
+                                                  lcg_state,
+                                                  max_hits);
 #else
-       return bvh_intersect_subsurface(kg, ray, isect, subsurface_object, lcg_state, max_hits);
+       return bvh_intersect_subsurface(kg,
+                                       ray,
+                                       ss_isect,
+                                       subsurface_object,
+                                       lcg_state,
+                                       max_hits);
 #endif /* __INSTANCING__ */
 
 #endif /* __KERNEL_CPU__ */
index a093b9b55aace7c9acc70c116f9454c12c5f8986..b9f1a46afb6103431452067cf993a31a513c8b7f 100644 (file)
@@ -30,9 +30,9 @@
  *
  */
 
-ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
                                             const Ray *ray,
-                                            Intersection *isect_array,
+                                            SubsurfaceIntersection *ss_isect,
                                             int subsurface_object,
                                             uint *lcg_state,
                                             int max_hits)
@@ -60,7 +60,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
        int object = OBJECT_NONE;
        float isect_t = ray->t;
 
-       uint num_hits = 0;
+       ss_isect->num_hits = 0;
 
 #if BVH_FEATURE(BVH_MOTION)
        Transform ob_itfm;
@@ -210,7 +210,15 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
                                                                uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
                                                                if(tri_object != subsurface_object)
                                                                        continue;
-                                                               triangle_intersect_subsurface(kg, &isect_precalc, isect_array, P, object, primAddr, isect_t, &num_hits, lcg_state, max_hits);
+                                                               triangle_intersect_subsurface(kg,
+                                                                                             &isect_precalc,
+                                                                                             ss_isect,
+                                                                                             P,
+                                                                                             object,
+                                                                                             primAddr,
+                                                                                             isect_t,
+                                                                                             lcg_state,
+                                                                                             max_hits);
                                                        }
                                                        break;
                                                }
@@ -223,7 +231,16 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
                                                                uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
                                                                if(tri_object != subsurface_object)
                                                                        continue;
-                                                               motion_triangle_intersect_subsurface(kg, isect_array, P, dir, ray->time, object, primAddr, isect_t, &num_hits, lcg_state, max_hits);
+                                                               motion_triangle_intersect_subsurface(kg,
+                                                                                                    ss_isect,
+                                                                                                    P,
+                                                                                                    dir,
+                                                                                                    ray->time,
+                                                                                                    object,
+                                                                                                    primAddr,
+                                                                                                    isect_t,
+                                                                                                    lcg_state,
+                                                                                                    max_hits);
                                                        }
                                                        break;
                                                }
@@ -301,13 +318,11 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
                }
 #endif  /* FEATURE(BVH_INSTANCING) */
        } while(nodeAddr != ENTRYPOINT_SENTINEL);
-
-       return num_hits;
 }
 
-ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg,
+ccl_device_inline void BVH_FUNCTION_NAME(KernelGlobals *kg,
                                          const Ray *ray,
-                                         Intersection *isect_array,
+                                         SubsurfaceIntersection *ss_isect,
                                          int subsurface_object,
                                          uint *lcg_state,
                                          int max_hits)
@@ -316,7 +331,7 @@ ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg,
        if(kernel_data.bvh.use_qbvh) {
                return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
                                                    ray,
-                                                   isect_array,
+                                                   ss_isect,
                                                    subsurface_object,
                                                    lcg_state,
                                                    max_hits);
@@ -327,7 +342,7 @@ ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg,
                kernel_assert(kernel_data.bvh.use_qbvh == false);
                return BVH_FUNCTION_FULL_NAME(BVH)(kg,
                                                   ray,
-                                                  isect_array,
+                                                  ss_isect,
                                                   subsurface_object,
                                                   lcg_state,
                                                   max_hits);
index 86f93f242a1da8cd5b1bbc04e828811daec104f3..a7b3f5cad282d2a43fa899845c39f593ff28502a 100644 (file)
@@ -358,8 +358,17 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, Intersection
  * multiple hits we pick a single random primitive as the intersection point. */
 
 #ifdef __SUBSURFACE__
-ccl_device_inline void motion_triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect_array,
-       float3 P, float3 dir, float time, int object, int triAddr, float tmax, uint *num_hits, uint *lcg_state, int max_hits)
+ccl_device_inline void motion_triangle_intersect_subsurface(
+        KernelGlobals *kg,
+        SubsurfaceIntersection *ss_isect,
+        float3 P,
+        float3 dir,
+        float time,
+        int object,
+        int triAddr,
+        float tmax,
+        uint *lcg_state,
+        int max_hits)
 {
        /* primitive index for vertex location lookup */
        int prim = kernel_tex_fetch(__prim_index, triAddr);
@@ -373,30 +382,34 @@ ccl_device_inline void motion_triangle_intersect_subsurface(KernelGlobals *kg, I
        float t, u, v;
 
        if(ray_triangle_intersect_uv(P, dir, tmax, verts[2], verts[0], verts[1], &u, &v, &t)) {
-               (*num_hits)++;
+               ss_isect->num_hits++;
 
                int hit;
 
-               if(*num_hits <= max_hits) {
-                       hit = *num_hits - 1;
+               if(ss_isect->num_hits <= max_hits) {
+                       hit = ss_isect->num_hits - 1;
                }
                else {
                        /* reservoir sampling: if we are at the maximum number of
                         * hits, randomly replace element or skip it */
-                       hit = lcg_step_uint(lcg_state) % *num_hits;
+                       hit = lcg_step_uint(lcg_state) % ss_isect->num_hits;
 
                        if(hit >= max_hits)
                                return;
                }
 
                /* record intersection */
-               Intersection *isect = &isect_array[hit];
+               Intersection *isect = &ss_isect->hits[hit];
                isect->t = t;
                isect->u = u;
                isect->v = v;
                isect->prim = triAddr;
                isect->object = object;
                isect->type = PRIMITIVE_MOTION_TRIANGLE;
+
+               /* Record geometric normal. */
+               ss_isect->Ng[hit] = normalize(cross(verts[1] - verts[0],
+                                                   verts[2] - verts[0]));
        }
 }
 #endif
index fe231720cf73f395d441dd1c3aa3317cc2e106dc..98e1d27b79e232fe43364f270c6f4f36a22c63de 100644 (file)
@@ -26,9 +26,9 @@
  *
  */
 
-ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
+ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
                                              const Ray *ray,
-                                             Intersection *isect_array,
+                                             SubsurfaceIntersection *ss_isect,
                                              int subsurface_object,
                                              uint *lcg_state,
                                              int max_hits)
@@ -55,7 +55,8 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
        float3 idir = bvh_inverse_direction(dir);
        int object = OBJECT_NONE;
        float isect_t = ray->t;
-       uint num_hits = 0;
+
+       ss_isect->num_hits = 0;
 
 #if BVH_FEATURE(BVH_MOTION)
        Transform ob_itfm;
@@ -63,7 +64,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 
 #ifndef __KERNEL_SSE41__
        if(!isfinite(P.x)) {
-               return 0;
+               return;
        }
 #endif
 
@@ -226,7 +227,15 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
                                                                if(tri_object != subsurface_object) {
                                                                        continue;
                                                                }
-                                                               triangle_intersect_subsurface(kg, &isect_precalc, isect_array, P, object, primAddr, isect_t, &num_hits, lcg_state, max_hits);
+                                                               triangle_intersect_subsurface(kg,
+                                                                                             &isect_precalc,
+                                                                                             ss_isect,
+                                                                                             P,
+                                                                                             object,
+                                                                                             primAddr,
+                                                                                             isect_t,
+                                                                                             lcg_state,
+                                                                                             max_hits);
                                                        }
                                                        break;
                                                }
@@ -240,7 +249,16 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
                                                                if(tri_object != subsurface_object) {
                                                                        continue;
                                                                }
-                                                               motion_triangle_intersect_subsurface(kg, isect_array, P, dir, ray->time, object, primAddr, isect_t, &num_hits, lcg_state, max_hits);
+                                                               motion_triangle_intersect_subsurface(kg,
+                                                                                                    ss_isect,
+                                                                                                    P,
+                                                                                                    dir,
+                                                                                                    ray->time,
+                                                                                                    object,
+                                                                                                    primAddr,
+                                                                                                    isect_t,
+                                                                                                    lcg_state,
+                                                                                                    max_hits);
                                                        }
                                                        break;
                                                }
@@ -321,6 +339,4 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
                }
 #endif  /* FEATURE(BVH_INSTANCING) */
        } while(nodeAddr != ENTRYPOINT_SENTINEL);
-
-       return num_hits;
 }
index 970616eb894753d32202fdeac888cb4fc955c6df..d37e593005c2eadff11bdbfb15ddf64d017630e4 100644 (file)
@@ -204,12 +204,11 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
 ccl_device_inline void triangle_intersect_subsurface(
         KernelGlobals *kg,
         const IsectPrecalc *isect_precalc,
-        Intersection *isect_array,
+        SubsurfaceIntersection *ss_isect,
         float3 P,
         int object,
         int triAddr,
         float tmax,
-        uint *num_hits,
         uint *lcg_state,
         int max_hits)
 {
@@ -272,29 +271,36 @@ ccl_device_inline void triangle_intersect_subsurface(
        /* Normalize U, V, W, and T. */
        const float inv_det = 1.0f / det;
 
-       (*num_hits)++;
+       ss_isect->num_hits++;
        int hit;
 
-       if(*num_hits <= max_hits) {
-               hit = *num_hits - 1;
+       if(ss_isect->num_hits <= max_hits) {
+               hit = ss_isect->num_hits - 1;
        }
        else {
                /* reservoir sampling: if we are at the maximum number of
                 * hits, randomly replace element or skip it */
-               hit = lcg_step_uint(lcg_state) % *num_hits;
+               hit = lcg_step_uint(lcg_state) % ss_isect->num_hits;
 
                if(hit >= max_hits)
                        return;
        }
 
        /* record intersection */
-       Intersection *isect = &isect_array[hit];
+       Intersection *isect = &ss_isect->hits[hit];
        isect->prim = triAddr;
        isect->object = object;
        isect->type = PRIMITIVE_TRIANGLE;
        isect->u = U * inv_det;
        isect->v = V * inv_det;
        isect->t = T * inv_det;
+
+       /* Record geometric normal. */
+       /* TODO(sergey): Use float4_to_float3() on just an edges. */
+       const float3 v0 = float4_to_float3(tri_a);
+       const float3 v1 = float4_to_float3(tri_b);
+       const float3 v2 = float4_to_float3(tri_c);
+       ss_isect->Ng[hit] = normalize(cross(v1 - v0, v2 - v0));
 }
 #endif
 
index 9794ad1d1805e31e65232832cbbb112c5b148f24..87d36efa4d4eee527c43e3a585f8c6fd3839dae5 100644 (file)
@@ -338,10 +338,16 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd
        if(sc) {
                uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
 
-               ShaderData bssrdf_sd[BSSRDF_MAX_HITS];
+               SubsurfaceIntersection ss_isect;
                float bssrdf_u, bssrdf_v;
                path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
-               int num_hits = subsurface_scatter_multi_step(kg, sd, bssrdf_sd, state->flag, sc, &lcg_state, bssrdf_u, bssrdf_v, false);
+               int num_hits = subsurface_scatter_multi_intersect(kg,
+                                                                 &ss_isect,
+                                                                 sd,
+                                                                 sc,
+                                                                 &lcg_state,
+                                                                 bssrdf_u, bssrdf_v,
+                                                                 false);
 #ifdef __VOLUME__
                Ray volume_ray = *ray;
                bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
@@ -350,15 +356,26 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd
 
                /* compute lighting with the BSDF closure */
                for(int hit = 0; hit < num_hits; hit++) {
+                       /* NOTE: We reuse the existing ShaderData, we assume the path
+                        * integration loop stops when this function returns true.
+                        */
+                       subsurface_scatter_multi_setup(kg,
+                                                      &ss_isect,
+                                                      hit,
+                                                      sd,
+                                                      state->flag,
+                                                      sc,
+                                                      false);
+
                        float3 tp = *throughput;
                        PathState hit_state = *state;
                        Ray hit_ray = *ray;
 
                        hit_state.rng_offset += PRNG_BOUNCE_NUM;
-                       
-                       kernel_path_surface_connect_light(kg, rng, &bssrdf_sd[hit], tp, state, L);
 
-                       if(kernel_path_surface_bounce(kg, rng, &bssrdf_sd[hit], &tp, &hit_state, L, &hit_ray)) {
+                       kernel_path_surface_connect_light(kg, rng, sd, tp, state, L);
+
+                       if(kernel_path_surface_bounce(kg, rng, sd, &tp, &hit_state, L, &hit_ray)) {
 #ifdef __LAMP_MIS__
                                hit_state.ray_t = 0.0f;
 #endif
index b6d64985f6ad725fcdd2aed82cab6bea482241cc..b6f95d6b0d26f7b97d4e6722c3531415c731cf32 100644 (file)
@@ -128,10 +128,16 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
                /* do subsurface scatter step with copy of shader data, this will
                 * replace the BSSRDF with a diffuse BSDF closure */
                for(int j = 0; j < num_samples; j++) {
-                       ShaderData bssrdf_sd[BSSRDF_MAX_HITS];
+                       SubsurfaceIntersection ss_isect;
                        float bssrdf_u, bssrdf_v;
                        path_branched_rng_2D(kg, &bssrdf_rng, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
-                       int num_hits = subsurface_scatter_multi_step(kg, sd, bssrdf_sd, state->flag, sc, &lcg_state, bssrdf_u, bssrdf_v, true);
+                       int num_hits = subsurface_scatter_multi_intersect(kg,
+                                                                         &ss_isect,
+                                                                         sd,
+                                                                         sc,
+                                                                         &lcg_state,
+                                                                         bssrdf_u, bssrdf_v,
+                                                                         true);
 #ifdef __VOLUME__
                        Ray volume_ray = *ray;
                        bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
@@ -140,6 +146,15 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 
                        /* compute lighting with the BSDF closure */
                        for(int hit = 0; hit < num_hits; hit++) {
+                               ShaderData bssrdf_sd = *sd;
+                               subsurface_scatter_multi_setup(kg,
+                                                              &ss_isect,
+                                                              hit,
+                                                              &bssrdf_sd,
+                                                              state->flag,
+                                                              sc,
+                                                              true);
+
                                PathState hit_state = *state;
 
                                path_state_branch(&hit_state, j, num_samples);
@@ -147,7 +162,7 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 #ifdef __VOLUME__
                                if(need_update_volume_stack) {
                                        /* Setup ray from previous surface point to the new one. */
-                                       float3 P = ray_offset(bssrdf_sd[hit].P, -bssrdf_sd[hit].Ng);
+                                       float3 P = ray_offset(bssrdf_sd.P, -bssrdf_sd.Ng);
                                        volume_ray.D = normalize_len(P - volume_ray.P,
                                                                     &volume_ray.t);
 
@@ -165,15 +180,27 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
                                /* direct light */
                                if(kernel_data.integrator.use_direct_light) {
                                        bool all = kernel_data.integrator.sample_all_lights_direct;
-                                       kernel_branched_path_surface_connect_light(kg, rng,
-                                               &bssrdf_sd[hit], &hit_state, throughput, num_samples_inv, L, all);
+                                       kernel_branched_path_surface_connect_light(
+                                               kg,
+                                               rng,
+                                               &bssrdf_sd,
+                                               &hit_state,
+                                               throughput,
+                                               num_samples_inv,
+                                               L,
+                                               all);
                                }
 #endif
 
                                /* indirect light */
-                               kernel_branched_path_surface_indirect_light(kg, rng,
-                                       &bssrdf_sd[hit], throughput, num_samples_inv,
-                                       &hit_state, L);
+                               kernel_branched_path_surface_indirect_light(
+                                       kg,
+                                       rng,
+                                       &bssrdf_sd,
+                                       throughput,
+                                       num_samples_inv,
+                                       &hit_state,
+                                       L);
                        }
                }
        }
index 2da060c32a25ff23c19474a6be4d095bc8f0094f..b992856179172a4303578f9b210fbe4cab73886f 100644 (file)
@@ -179,19 +179,23 @@ ccl_device float3 subsurface_color_pow(float3 color, float exponent)
        return color;
 }
 
-ccl_device void subsurface_color_bump_blur(KernelGlobals *kg, ShaderData *out_sd, ShaderData *in_sd, int state_flag, float3 *eval, float3 *N)
+ccl_device void subsurface_color_bump_blur(KernelGlobals *kg,
+                                           ShaderData *sd,
+                                           int state_flag,
+                                           float3 *eval,
+                                           float3 *N)
 {
        /* average color and texture blur at outgoing point */
        float texture_blur;
-       float3 out_color = shader_bssrdf_sum(out_sd, NULL, &texture_blur);
+       float3 out_color = shader_bssrdf_sum(sd, NULL, &texture_blur);
 
        /* do we have bump mapping? */
-       bool bump = (out_sd->flag & SD_HAS_BSSRDF_BUMP) != 0;
+       bool bump = (sd->flag & SD_HAS_BSSRDF_BUMP) != 0;
 
        if(bump || texture_blur > 0.0f) {
                /* average color and normal at incoming point */
-               shader_eval_surface(kg, in_sd, 0.0f, state_flag, SHADER_CONTEXT_SSS);
-               float3 in_color = shader_bssrdf_sum(in_sd, (bump)? N: NULL, NULL);
+               shader_eval_surface(kg, sd, 0.0f, state_flag, SHADER_CONTEXT_SSS);
+               float3 in_color = shader_bssrdf_sum(sd, (bump)? N: NULL, NULL);
 
                /* we simply divide out the average color and multiply with the average
                 * of the other one. we could try to do this per closure but it's quite
@@ -206,14 +210,23 @@ ccl_device void subsurface_color_bump_blur(KernelGlobals *kg, ShaderData *out_sd
        }
 }
 
-/* subsurface scattering step, from a point on the surface to other nearby points on the same object */
-ccl_device int subsurface_scatter_multi_step(KernelGlobals *kg, ShaderData *sd, ShaderData bssrdf_sd[BSSRDF_MAX_HITS],
-       int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all)
+/* Subsurface scattering step, from a point on the surface to other
+ * nearby points on the same object.
+ */
+ccl_device int subsurface_scatter_multi_intersect(
+        KernelGlobals *kg,
+        SubsurfaceIntersection* ss_isect,
+        ShaderData *sd,
+        ShaderClosure *sc,
+        uint *lcg_state,
+        float disk_u,
+        float disk_v,
+        bool all)
 {
        /* pick random axis in local frame and point on disk */
        float3 disk_N, disk_T, disk_B;
        float pick_pdf_N, pick_pdf_T, pick_pdf_B;
-       
+
        disk_N = sd->Ng;
        make_orthonormals(disk_N, &disk_T, &disk_B);
 
@@ -259,70 +272,89 @@ ccl_device int subsurface_scatter_multi_step(KernelGlobals *kg, ShaderData *sd,
        float3 disk_P = (disk_r*cosf(phi)) * disk_T + (disk_r*sinf(phi)) * disk_B;
 
        /* create ray */
-       Ray ray;
-       ray.P = sd->P + disk_N*disk_height + disk_P;
-       ray.D = -disk_N;
-       ray.t = 2.0f*disk_height;
-       ray.dP = sd->dP;
-       ray.dD = differential3_zero();
-       ray.time = sd->time;
+       Ray *ray = &ss_isect->ray;
+       ray->P = sd->P + disk_N*disk_height + disk_P;
+       ray->D = -disk_N;
+       ray->t = 2.0f*disk_height;
+       ray->dP = sd->dP;
+       ray->dD = differential3_zero();
+       ray->time = sd->time;
 
        /* intersect with the same object. if multiple intersections are found it
         * will use at most BSSRDF_MAX_HITS hits, a random subset of all hits */
-       Intersection isect[BSSRDF_MAX_HITS];
-       uint num_hits = scene_intersect_subsurface(kg, &ray, isect, sd->object, lcg_state, BSSRDF_MAX_HITS);
-
-       /* evaluate bssrdf */
-       float3 eval = make_float3(0.0f, 0.0f, 0.0f);
-       int num_eval_hits = min(num_hits, BSSRDF_MAX_HITS);
+       scene_intersect_subsurface(kg,
+                                  ray,
+                                  ss_isect,
+                                  sd->object,
+                                  lcg_state,
+                                  BSSRDF_MAX_HITS);
+       /* TODO(sergey): Investigate whether scene_intersect_subsurface() could
+        * indeed return more than BSSRDF_MAX_HITS hits.
+        */
+       int num_eval_hits = min(ss_isect->num_hits, BSSRDF_MAX_HITS);
 
        for(int hit = 0; hit < num_eval_hits; hit++) {
-               ShaderData *bsd = &bssrdf_sd[hit];
-
-               /* setup new shading point */
-               *bsd = *sd;
-               shader_setup_from_subsurface(kg, bsd, &isect[hit], &ray);
+               /* Quickly retrieve P and Ng without setting up ShaderData. */
+               float3 hit_P = ray->P + ss_isect->hits[hit].t * ray->D;
+               float3 hit_Ng = ss_isect->Ng[hit];
+               if(ss_isect->hits[hit].object != OBJECT_NONE) {
+                       object_normal_transform(kg, sd, &hit_Ng);
+               }
 
                /* probability densities for local frame axes */
-               float pdf_N = pick_pdf_N * fabsf(dot(disk_N, bsd->Ng));
-               float pdf_T = pick_pdf_T * fabsf(dot(disk_T, bsd->Ng));
-               float pdf_B = pick_pdf_B * fabsf(dot(disk_B, bsd->Ng));
-               
+               float pdf_N = pick_pdf_N * fabsf(dot(disk_N, hit_Ng));
+               float pdf_T = pick_pdf_T * fabsf(dot(disk_T, hit_Ng));
+               float pdf_B = pick_pdf_B * fabsf(dot(disk_B, hit_Ng));
+
                /* multiple importance sample between 3 axes, power heuristic
                 * found to be slightly better than balance heuristic */
                float mis_weight = power_heuristic_3(pdf_N, pdf_T, pdf_B);
 
                /* real distance to sampled point */
-               float r = len(bsd->P - sd->P);
+               float r = len(hit_P - sd->P);
 
                /* evaluate */
                float w = mis_weight / pdf_N;
-               if(num_hits > BSSRDF_MAX_HITS)
-                       w *= num_hits/(float)BSSRDF_MAX_HITS;
-               eval = subsurface_scatter_eval(bsd, sc, disk_r, r, all) * w;
+               if(ss_isect->num_hits > BSSRDF_MAX_HITS)
+                       w *= ss_isect->num_hits/(float)BSSRDF_MAX_HITS;
+               float3 eval = subsurface_scatter_eval(sd, sc, disk_r, r, all) * w;
 
-               /* optionally blur colors and bump mapping */
-               float3 N = bsd->N;
-               subsurface_color_bump_blur(kg, sd, bsd, state_flag, &eval, &N);
-
-               /* setup diffuse bsdf */
-               subsurface_scatter_setup_diffuse_bsdf(bsd, eval, true, N);
+               ss_isect->weight[hit] = eval;
        }
 
        return num_eval_hits;
 }
 
+ccl_device void subsurface_scatter_multi_setup(KernelGlobals *kg,
+                                               SubsurfaceIntersection* ss_isect,  /* source of hits[], weight[] and ray read below */
+                                               int hit,  /* index into ss_isect->hits and ss_isect->weight */
+                                               ShaderData *sd,  /* set up in place as the shading point for this hit */
+                                               int state_flag,  /* forwarded to subsurface_color_bump_blur() */
+                                               ShaderClosure *sc,  /* not referenced in this body */
+                                               bool all)  /* not referenced in this body */
+{
+       /* Setup new shading point in sd from the stored hit record and the stored ray. */
+       shader_setup_from_subsurface(kg, sd, &ss_isect->hits[hit], &ss_isect->ray);
+
+       /* Optionally blur colors and bump mapping; weight and N are local copies handed over by pointer. */
+       float3 weight = ss_isect->weight[hit];
+       float3 N = sd->N;
+       subsurface_color_bump_blur(kg, sd, state_flag, &weight, &N);
+
+       /* Setup diffuse BSDF on sd from the weight and normal left by the step above. */
+       subsurface_scatter_setup_diffuse_bsdf(sd, weight, true, N);
+}
+
 /* subsurface scattering step, from a point on the surface to another nearby point on the same object */
 ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd,
        int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all)
 {
        float3 eval = make_float3(0.0f, 0.0f, 0.0f);
-       uint num_hits = 0;
 
        /* pick random axis in local frame and point on disk */
        float3 disk_N, disk_T, disk_B;
        float pick_pdf_N, pick_pdf_T, pick_pdf_B;
-       
+
        disk_N = sd->Ng;
        make_orthonormals(disk_N, &disk_T, &disk_B);
 
@@ -368,21 +400,21 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd,
 
        /* intersect with the same object. if multiple intersections are
         * found it will randomly pick one of them */
-       Intersection isect;
-       num_hits = scene_intersect_subsurface(kg, &ray, &isect, sd->object, lcg_state, 1);
+       SubsurfaceIntersection ss_isect;
+       scene_intersect_subsurface(kg, &ray, &ss_isect, sd->object, lcg_state, 1);
 
        /* evaluate bssrdf */
-       if(num_hits > 0) {
+       if(ss_isect.num_hits > 0) {
                float3 origP = sd->P;
 
                /* setup new shading point */
-               shader_setup_from_subsurface(kg, sd, &isect, &ray);
+               shader_setup_from_subsurface(kg, sd, &ss_isect.hits[0], &ray);
 
                /* probability densities for local frame axes */
                float pdf_N = pick_pdf_N * fabsf(dot(disk_N, sd->Ng));
                float pdf_T = pick_pdf_T * fabsf(dot(disk_T, sd->Ng));
                float pdf_B = pick_pdf_B * fabsf(dot(disk_B, sd->Ng));
-               
+
                /* multiple importance sample between 3 axes, power heuristic
                 * found to be slightly better than balance heuristic */
                float mis_weight = power_heuristic_3(pdf_N, pdf_T, pdf_B);
@@ -391,16 +423,16 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd,
                float r = len(sd->P - origP);
 
                /* evaluate */
-               float w = (mis_weight * num_hits) / pdf_N;
+               float w = (mis_weight * ss_isect.num_hits) / pdf_N;
                eval = subsurface_scatter_eval(sd, sc, disk_r, r, all) * w;
        }
 
        /* optionally blur colors and bump mapping */
        float3 N = sd->N;
-       subsurface_color_bump_blur(kg, sd, sd, state_flag, &eval, &N);
+       subsurface_color_bump_blur(kg, sd, state_flag, &eval, &N);
 
        /* setup diffuse bsdf */
-       subsurface_scatter_setup_diffuse_bsdf(sd, eval, (num_hits > 0), N);
+       subsurface_scatter_setup_diffuse_bsdf(sd, eval, (ss_isect.num_hits > 0), N);
 }
 
 CCL_NAMESPACE_END
index 5ccbc1de94c84f02734568194f7dde03e7d5953e..e04f500e41069c7fdf1424804dbd09e15fb67ac2 100644 (file)
@@ -520,6 +520,18 @@ typedef ccl_addr_space struct Intersection {
 #endif
 } Intersection;
 
+/* Subsurface Intersection result: the probe ray plus per-hit records, each array indexed by hit number. */
+
+typedef struct SubsurfaceIntersection  /* typedef'd like Intersection so the bare name is usable as a type */
+{
+       Ray ray;  /* written by subsurface_scatter_multi_intersect(), read by subsurface_scatter_multi_setup() */
+       float3 weight[BSSRDF_MAX_HITS];  /* per-hit weight stored by subsurface_scatter_multi_intersect() */
+
+       int num_hits;  /* read after scene_intersect_subsurface(); clamp to BSSRDF_MAX_HITS before indexing */
+       struct Intersection hits[BSSRDF_MAX_HITS];  /* per-hit intersection records; users read .t and .object */
+       float3 Ng[BSSRDF_MAX_HITS];  /* per-hit normal; presumably object space, see object_normal_transform() use -- TODO confirm */
+} SubsurfaceIntersection;
+
 /* Primitives */
 
 typedef enum PrimitiveType {