Fix Cycles CUDA performance on CUDA 8.0.
[blender-staging.git] / intern / cycles / kernel / kernel_shadow.h
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 CCL_NAMESPACE_BEGIN
18
19 #ifdef __SHADOW_RECORD_ALL__
20
21 /* Shadow function to compute how much light is blocked, CPU variation.
22  *
23  * We trace a single ray. If it hits any opaque surface, or more than a given
24  * number of transparent surfaces is hit, then we consider the geometry to be
25  * entirely blocked. If not, all transparent surfaces will be recorded and we
26  * will shade them one by one to determine how much light is blocked. This all
27  * happens in one scene intersection function.
28  *
29  * Recording all hits works well in some cases but may be slower in others. If
30  * we have many semi-transparent hairs, one intersection may be faster because
31  * you'd be reintersecting the same hairs a lot with each step otherwise. If
32  * however there is mostly binary transparency then we may be recording many
33  * unnecessary intersections when one of the first surfaces blocks all light.
34  *
35  * From tests in real scenes it seems the performance loss is either minimal,
36  * or there is a performance increase anyway due to avoiding the need to send
37  * two rays with transparent shadows.
38  *
39  * This is CPU only because of qsort, and malloc or high stack space usage to
40  * record all these intersections. */
41
42 #define STACK_MAX_HITS 64
43
44 ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ShaderData *shadow_sd, PathState *state, Ray *ray, float3 *shadow)
45 {
46         *shadow = make_float3(1.0f, 1.0f, 1.0f);
47
48         if(ray->t == 0.0f)
49                 return false;
50         
51         bool blocked;
52
53         if(kernel_data.integrator.transparent_shadows) {
54                 /* check transparent bounces here, for volume scatter which can do
55                  * lighting before surface path termination is checked */
56                 if(state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce)
57                         return true;
58
59                 /* intersect to find an opaque surface, or record all transparent surface hits */
60                 Intersection hits_stack[STACK_MAX_HITS];
61                 Intersection *hits = hits_stack;
62                 const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
63                 uint max_hits = transparent_max_bounce - state->transparent_bounce - 1;
64
65                 /* prefer to use stack but use dynamic allocation if too deep max hits
66                  * we need max_hits + 1 storage space due to the logic in
67                  * scene_intersect_shadow_all which will first store and then check if
68                  * the limit is exceeded */
69                 if(max_hits + 1 > STACK_MAX_HITS) {
70                         if(kg->transparent_shadow_intersections == NULL) {
71                                 kg->transparent_shadow_intersections =
72                                     (Intersection*)malloc(sizeof(Intersection)*(transparent_max_bounce + 1));
73                         }
74                         hits = kg->transparent_shadow_intersections;
75                 }
76
77                 uint num_hits;
78                 blocked = scene_intersect_shadow_all(kg, ray, hits, max_hits, &num_hits);
79
80                 /* if no opaque surface found but we did find transparent hits, shade them */
81                 if(!blocked && num_hits > 0) {
82                         float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
83                         float3 Pend = ray->P + ray->D*ray->t;
84                         float last_t = 0.0f;
85                         int bounce = state->transparent_bounce;
86                         Intersection *isect = hits;
87 #ifdef __VOLUME__
88                         PathState ps = *state;
89 #endif
90
91                         qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
92
93                         for(int hit = 0; hit < num_hits; hit++, isect++) {
94                                 /* adjust intersection distance for moving ray forward */
95                                 float new_t = isect->t;
96                                 isect->t -= last_t;
97
98                                 /* skip hit if we did not move forward, step by step raytracing
99                                  * would have skipped it as well then */
100                                 if(last_t == new_t)
101                                         continue;
102
103                                 last_t = new_t;
104
105 #ifdef __VOLUME__
106                                 /* attenuation between last surface and next surface */
107                                 if(ps.volume_stack[0].shader != SHADER_NONE) {
108                                         Ray segment_ray = *ray;
109                                         segment_ray.t = isect->t;
110                                         kernel_volume_shadow(kg, shadow_sd, &ps, &segment_ray, &throughput);
111                                 }
112 #endif
113
114                                 /* setup shader data at surface */
115                                 shader_setup_from_ray(kg, shadow_sd, isect, ray);
116
117                                 /* attenuation from transparent surface */
118                                 if(!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) {
119                                         path_state_modify_bounce(state, true);
120                                         shader_eval_surface(kg, shadow_sd, NULL, state, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW);
121                                         path_state_modify_bounce(state, false);
122
123                                         throughput *= shader_bsdf_transparency(kg, shadow_sd);
124                                 }
125
126                                 /* stop if all light is blocked */
127                                 if(is_zero(throughput)) {
128                                         return true;
129                                 }
130
131                                 /* move ray forward */
132                                 ray->P = shadow_sd->P;
133                                 if(ray->t != FLT_MAX) {
134                                         ray->D = normalize_len(Pend - ray->P, &ray->t);
135                                 }
136
137 #ifdef __VOLUME__
138                                 /* exit/enter volume */
139                                 kernel_volume_stack_enter_exit(kg, shadow_sd, ps.volume_stack);
140 #endif
141
142                                 bounce++;
143                         }
144
145 #ifdef __VOLUME__
146                         /* attenuation for last line segment towards light */
147                         if(ps.volume_stack[0].shader != SHADER_NONE)
148                                 kernel_volume_shadow(kg, shadow_sd, &ps, ray, &throughput);
149 #endif
150
151                         *shadow = throughput;
152
153                         return is_zero(throughput);
154                 }
155         }
156         else {
157                 Intersection isect;
158                 blocked = scene_intersect(kg, *ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
159         }
160
161 #ifdef __VOLUME__
162         if(!blocked && state->volume_stack[0].shader != SHADER_NONE) {
163                 /* apply attenuation from current volume shader */
164                 kernel_volume_shadow(kg, shadow_sd, state, ray, shadow);
165         }
166 #endif
167
168         return blocked;
169 }
170
171 #undef STACK_MAX_HITS
172
173 #else
174
175 /* Shadow function to compute how much light is blocked, GPU variation.
176  *
177  * Here we raytrace from one transparent surface to the next step by step.
178  * To minimize overhead in cases where we don't need transparent shadows, we
179  * first trace a regular shadow ray. We check if the hit primitive was
180  * potentially transparent, and only in that case start marching. This gives
181  * one extra ray cast for the cases where we do want transparency. */
182
183 ccl_device_noinline bool shadow_blocked(KernelGlobals *kg,
184                                         ShaderData *shadow_sd,
185                                         ccl_addr_space PathState *state,
186                                         ccl_addr_space Ray *ray_input,
187                                         float3 *shadow)
188 {
189         *shadow = make_float3(1.0f, 1.0f, 1.0f);
190
191         if(ray_input->t == 0.0f)
192                 return false;
193
194 #ifdef __SPLIT_KERNEL__
195         Ray private_ray = *ray_input;
196         Ray *ray = &private_ray;
197 #else
198         Ray *ray = ray_input;
199 #endif
200
201 #ifdef __SPLIT_KERNEL__
202         Intersection *isect = &kg->isect_shadow[SD_THREAD];
203 #else
204         Intersection isect_object;
205         Intersection *isect = &isect_object;
206 #endif
207
208         bool blocked = scene_intersect(kg, *ray, PATH_RAY_SHADOW_OPAQUE, isect, NULL, 0.0f, 0.0f);
209
210 #ifdef __TRANSPARENT_SHADOWS__
211         if(blocked && kernel_data.integrator.transparent_shadows) {
212                 if(shader_transparent_shadow(kg, isect)) {
213                         float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
214                         float3 Pend = ray->P + ray->D*ray->t;
215                         int bounce = state->transparent_bounce;
216 #ifdef __VOLUME__
217                         PathState ps = *state;
218 #endif
219
220                         for(;;) {
221                                 if(bounce >= kernel_data.integrator.transparent_max_bounce)
222                                         return true;
223
224                                 if(!scene_intersect(kg, *ray, PATH_RAY_SHADOW_TRANSPARENT, isect, NULL, 0.0f, 0.0f))
225                                 {
226 #ifdef __VOLUME__
227                                         /* attenuation for last line segment towards light */
228                                         if(ps.volume_stack[0].shader != SHADER_NONE)
229                                                 kernel_volume_shadow(kg, shadow_sd, &ps, ray, &throughput);
230 #endif
231
232                                         *shadow *= throughput;
233
234                                         return false;
235                                 }
236
237                                 if(!shader_transparent_shadow(kg, isect)) {
238                                         return true;
239                                 }
240
241 #ifdef __VOLUME__
242                                 /* attenuation between last surface and next surface */
243                                 if(ps.volume_stack[0].shader != SHADER_NONE) {
244                                         Ray segment_ray = *ray;
245                                         segment_ray.t = isect->t;
246                                         kernel_volume_shadow(kg, shadow_sd, &ps, &segment_ray, &throughput);
247                                 }
248 #endif
249
250                                 /* setup shader data at surface */
251                                 shader_setup_from_ray(kg, shadow_sd, isect, ray);
252
253                                 /* attenuation from transparent surface */
254                                 if(!(ccl_fetch(shadow_sd, flag) & SD_HAS_ONLY_VOLUME)) {
255                                         path_state_modify_bounce(state, true);
256                                         shader_eval_surface(kg, shadow_sd, NULL, state, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW);
257                                         path_state_modify_bounce(state, false);
258
259                                         throughput *= shader_bsdf_transparency(kg, shadow_sd);
260                                 }
261
262                                 /* stop if all light is blocked */
263                                 if(is_zero(throughput)) {
264                                         return true;
265                                 }
266
267                                 /* move ray forward */
268                                 ray->P = ray_offset(ccl_fetch(shadow_sd, P), -ccl_fetch(shadow_sd, Ng));
269                                 if(ray->t != FLT_MAX) {
270                                         ray->D = normalize_len(Pend - ray->P, &ray->t);
271                                 }
272
273 #ifdef __VOLUME__
274                                 /* exit/enter volume */
275                                 kernel_volume_stack_enter_exit(kg, shadow_sd, ps.volume_stack);
276 #endif
277
278                                 bounce++;
279                         }
280                 }
281         }
282 #ifdef __VOLUME__
283         else if(!blocked && state->volume_stack[0].shader != SHADER_NONE) {
284                 /* apply attenuation from current volume shader */
285                 kernel_volume_shadow(kg, shadow_sd, state, ray, shadow);
286         }
287 #endif
288 #endif
289
290         return blocked;
291 }
292
293 #endif
294
295 CCL_NAMESPACE_END
296