Merge branch 'master' into blender2.8
[blender.git] / intern / cycles / bvh / bvh_build.cpp
1 /*
2  * Adapted from code copyright 2009-2010 NVIDIA Corporation
3  * Modifications Copyright 2011, Blender Foundation.
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17
18 #include "bvh/bvh_build.h"
19
20 #include "bvh/bvh_binning.h"
21 #include "bvh/bvh_node.h"
22 #include "bvh/bvh_params.h"
23 #include "bvh_split.h"
24
25 #include "render/mesh.h"
26 #include "render/object.h"
27 #include "render/scene.h"
28 #include "render/curves.h"
29
30 #include "util/util_algorithm.h"
31 #include "util/util_debug.h"
32 #include "util/util_foreach.h"
33 #include "util/util_logging.h"
34 #include "util/util_progress.h"
35 #include "util/util_stack_allocator.h"
36 #include "util/util_simd.h"
37 #include "util/util_time.h"
38 #include "util/util_queue.h"
39
40 CCL_NAMESPACE_BEGIN
41
42 /* BVH Build Task */
43
44 class BVHBuildTask : public Task {
45 public:
46         BVHBuildTask(BVHBuild *build,
47                      InnerNode *node,
48                      int child,
49                      const BVHObjectBinning& range,
50                      int level)
51         : range_(range)
52         {
53                 run = function_bind(&BVHBuild::thread_build_node,
54                                     build,
55                                     node,
56                                     child,
57                                     &range_,
58                                     level);
59         }
60 private:
61         BVHObjectBinning range_;
62 };
63
64 class BVHSpatialSplitBuildTask : public Task {
65 public:
66         BVHSpatialSplitBuildTask(BVHBuild *build,
67                                  InnerNode *node,
68                                  int child,
69                                  const BVHRange& range,
70                                  const vector<BVHReference>& references,
71                                  int level)
72         : range_(range),
73           references_(references.begin() + range.start(),
74                       references.begin() + range.end())
75         {
76                 range_.set_start(0);
77                 run = function_bind(&BVHBuild::thread_build_spatial_split_node,
78                                     build,
79                                     node,
80                                     child,
81                                     &range_,
82                                     &references_,
83                                     level,
84                                     _1);
85         }
86 private:
87         BVHRange range_;
88         vector<BVHReference> references_;
89 };
90
91 /* Constructor / Destructor */
92
93 BVHBuild::BVHBuild(const vector<Object*>& objects_,
94                    array<int>& prim_type_,
95                    array<int>& prim_index_,
96                    array<int>& prim_object_,
97                    array<float2>& prim_time_,
98                    const BVHParams& params_,
99                    Progress& progress_)
100  : objects(objects_),
101    prim_type(prim_type_),
102    prim_index(prim_index_),
103    prim_object(prim_object_),
104    prim_time(prim_time_),
105    params(params_),
106    progress(progress_),
107    progress_start_time(0.0),
108    unaligned_heuristic(objects_)
109 {
110         spatial_min_overlap = 0.0f;
111 }
112
113 BVHBuild::~BVHBuild()
114 {
115 }
116
117 /* Adding References */
118
119 void BVHBuild::add_reference_triangles(BoundBox& root, BoundBox& center, Mesh *mesh, int i)
120 {
121         const Attribute *attr_mP = NULL;
122         if(mesh->has_motion_blur()) {
123                 attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
124         }
125         const size_t num_triangles = mesh->num_triangles();
126         for(uint j = 0; j < num_triangles; j++) {
127                 Mesh::Triangle t = mesh->get_triangle(j);
128                 const float3 *verts = &mesh->verts[0];
129                 if(attr_mP == NULL) {
130                         BoundBox bounds = BoundBox::empty;
131                         t.bounds_grow(verts, bounds);
132                         if(bounds.valid()) {
133                                 references.push_back(BVHReference(bounds,
134                                                                   j,
135                                                                   i,
136                                                                   PRIMITIVE_TRIANGLE));
137                                 root.grow(bounds);
138                                 center.grow(bounds.center2());
139                         }
140                 }
141                 else if(params.num_motion_triangle_steps == 0 || params.use_spatial_split) {
142                         /* Motion triangles, simple case: single node for the whole
143                          * primitive. Lowest memory footprint and faster BVH build but
144                          * least optimal ray-tracing.
145                          */
146                         /* TODO(sergey): Support motion steps for spatially split BVH. */
147                         const size_t num_verts = mesh->verts.size();
148                         const size_t num_steps = mesh->motion_steps;
149                         const float3 *vert_steps = attr_mP->data_float3();
150                         BoundBox bounds = BoundBox::empty;
151                         t.bounds_grow(verts, bounds);
152                         for(size_t step = 0; step < num_steps - 1; step++) {
153                                 t.bounds_grow(vert_steps + step*num_verts, bounds);
154                         }
155                         if(bounds.valid()) {
156                                 references.push_back(
157                                         BVHReference(bounds,
158                                                      j,
159                                                      i,
160                                                      PRIMITIVE_MOTION_TRIANGLE));
161                                 root.grow(bounds);
162                                 center.grow(bounds.center2());
163                         }
164                 }
165                 else {
166                         /* Motion triangles, trace optimized case:  we split triangle
167                          * primitives into separate nodes for each of the time steps.
168                          * This way we minimize overlap of neighbor curve primitives.
169                          */
170                         const int num_bvh_steps = params.num_motion_curve_steps * 2 + 1;
171                         const float num_bvh_steps_inv_1 = 1.0f / (num_bvh_steps - 1);
172                         const size_t num_verts = mesh->verts.size();
173                         const size_t num_steps = mesh->motion_steps;
174                         const float3 *vert_steps = attr_mP->data_float3();
175                         /* Calculate bounding box of the previous time step.
176                          * Will be reused later to avoid duplicated work on
177                          * calculating BVH time step boundbox.
178                          */
179                         float3 prev_verts[3];
180                         t.motion_verts(verts,
181                                        vert_steps,
182                                        num_verts,
183                                        num_steps,
184                                        0.0f,
185                                        prev_verts);
186                         BoundBox prev_bounds = BoundBox::empty;
187                         prev_bounds.grow(prev_verts[0]);
188                         prev_bounds.grow(prev_verts[1]);
189                         prev_bounds.grow(prev_verts[2]);
190                         /* Create all primitive time steps, */
191                         for(int bvh_step = 1; bvh_step < num_bvh_steps; ++bvh_step) {
192                                 const float curr_time = (float)(bvh_step) * num_bvh_steps_inv_1;
193                                 float3 curr_verts[3];
194                                 t.motion_verts(verts,
195                                                vert_steps,
196                                                num_verts,
197                                                num_steps,
198                                                curr_time,
199                                                curr_verts);
200                                 BoundBox curr_bounds = BoundBox::empty;
201                                 curr_bounds.grow(curr_verts[0]);
202                                 curr_bounds.grow(curr_verts[1]);
203                                 curr_bounds.grow(curr_verts[2]);
204                                 BoundBox bounds = prev_bounds;
205                                 bounds.grow(curr_bounds);
206                                 if(bounds.valid()) {
207                                         const float prev_time = (float)(bvh_step - 1) * num_bvh_steps_inv_1;
208                                         references.push_back(
209                                                 BVHReference(bounds,
210                                                              j,
211                                                              i,
212                                                              PRIMITIVE_MOTION_TRIANGLE,
213                                                              prev_time,
214                                                              curr_time));
215                                         root.grow(bounds);
216                                         center.grow(bounds.center2());
217                                 }
218                                 /* Current time boundbox becomes previous one for the
219                                  * next time step.
220                                  */
221                                 prev_bounds = curr_bounds;
222                         }
223                 }
224         }
225 }
226
227 void BVHBuild::add_reference_curves(BoundBox& root, BoundBox& center, Mesh *mesh, int i)
228 {
229         const Attribute *curve_attr_mP = NULL;
230         if(mesh->has_motion_blur()) {
231                 curve_attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
232         }
233         const size_t num_curves = mesh->num_curves();
234         for(uint j = 0; j < num_curves; j++) {
235                 const Mesh::Curve curve = mesh->get_curve(j);
236                 const float *curve_radius = &mesh->curve_radius[0];
237                 for(int k = 0; k < curve.num_keys - 1; k++) {
238                         if(curve_attr_mP == NULL) {
239                                 /* Really simple logic for static hair. */
240                                 BoundBox bounds = BoundBox::empty;
241                                 curve.bounds_grow(k, &mesh->curve_keys[0], curve_radius, bounds);
242                                 if(bounds.valid()) {
243                                         int packed_type = PRIMITIVE_PACK_SEGMENT(PRIMITIVE_CURVE, k);
244                                         references.push_back(BVHReference(bounds, j, i, packed_type));
245                                         root.grow(bounds);
246                                         center.grow(bounds.center2());
247                                 }
248                         }
249                         else if(params.num_motion_curve_steps == 0 || params.use_spatial_split) {
250                                 /* Simple case of motion curves: single node for the while
251                                  * shutter time. Lowest memory usage but less optimal
252                                  * rendering.
253                                  */
254                                 /* TODO(sergey): Support motion steps for spatially split BVH. */
255                                 BoundBox bounds = BoundBox::empty;
256                                 curve.bounds_grow(k, &mesh->curve_keys[0], curve_radius, bounds);
257                                 const size_t num_keys = mesh->curve_keys.size();
258                                 const size_t num_steps = mesh->motion_steps;
259                                 const float3 *key_steps = curve_attr_mP->data_float3();
260                                 for(size_t step = 0; step < num_steps - 1; step++) {
261                                         curve.bounds_grow(k,
262                                                           key_steps + step*num_keys,
263                                                           curve_radius,
264                                                           bounds);
265                                 }
266                                 if(bounds.valid()) {
267                                         int packed_type = PRIMITIVE_PACK_SEGMENT(PRIMITIVE_MOTION_CURVE, k);
268                                         references.push_back(BVHReference(bounds,
269                                                                           j,
270                                                                           i,
271                                                                           packed_type));
272                                         root.grow(bounds);
273                                         center.grow(bounds.center2());
274                                 }
275                         }
276                         else {
277                                 /* Motion curves, trace optimized case:  we split curve keys
278                                  * primitives into separate nodes for each of the time steps.
279                                  * This way we minimize overlap of neighbor curve primitives.
280                                  */
281                                 const int num_bvh_steps = params.num_motion_curve_steps * 2 + 1;
282                                 const float num_bvh_steps_inv_1 = 1.0f / (num_bvh_steps - 1);
283                                 const size_t num_steps = mesh->motion_steps;
284                                 const float3 *curve_keys = &mesh->curve_keys[0];
285                                 const float3 *key_steps = curve_attr_mP->data_float3();
286                                 const size_t num_keys = mesh->curve_keys.size();
287                                 /* Calculate bounding box of the previous time step.
288                                  * Will be reused later to avoid duplicated work on
289                                  * calculating BVH time step boundbox.
290                                  */
291                                 float4 prev_keys[4];
292                                 curve.cardinal_motion_keys(curve_keys,
293                                                            curve_radius,
294                                                            key_steps,
295                                                            num_keys,
296                                                            num_steps,
297                                                            0.0f,
298                                                            k - 1, k, k + 1, k + 2,
299                                                            prev_keys);
300                                 BoundBox prev_bounds = BoundBox::empty;
301                                 curve.bounds_grow(prev_keys, prev_bounds);
302                                 /* Create all primitive time steps, */
303                                 for(int bvh_step = 1; bvh_step < num_bvh_steps; ++bvh_step) {
304                                         const float curr_time = (float)(bvh_step) * num_bvh_steps_inv_1;
305                                         float4 curr_keys[4];
306                                         curve.cardinal_motion_keys(curve_keys,
307                                                                    curve_radius,
308                                                                    key_steps,
309                                                                    num_keys,
310                                                                    num_steps,
311                                                                    curr_time,
312                                                                    k - 1, k, k + 1, k + 2,
313                                                                    curr_keys);
314                                         BoundBox curr_bounds = BoundBox::empty;
315                                         curve.bounds_grow(curr_keys, curr_bounds);
316                                         BoundBox bounds = prev_bounds;
317                                         bounds.grow(curr_bounds);
318                                         if(bounds.valid()) {
319                                                 const float prev_time = (float)(bvh_step - 1) * num_bvh_steps_inv_1;
320                                                 int packed_type = PRIMITIVE_PACK_SEGMENT(PRIMITIVE_MOTION_CURVE, k);
321                                                 references.push_back(BVHReference(bounds,
322                                                                                   j,
323                                                                                   i,
324                                                                                   packed_type,
325                                                                                   prev_time,
326                                                                                   curr_time));
327                                                 root.grow(bounds);
328                                                 center.grow(bounds.center2());
329                                         }
330                                         /* Current time boundbox becomes previous one for the
331                                          * next time step.
332                                          */
333                                         prev_bounds = curr_bounds;
334                                 }
335                         }
336                 }
337         }
338 }
339
340 void BVHBuild::add_reference_mesh(BoundBox& root, BoundBox& center, Mesh *mesh, int i)
341 {
342         if(params.primitive_mask & PRIMITIVE_ALL_TRIANGLE) {
343                 add_reference_triangles(root, center, mesh, i);
344         }
345         if(params.primitive_mask & PRIMITIVE_ALL_CURVE) {
346                 add_reference_curves(root, center, mesh, i);
347         }
348 }
349
350 void BVHBuild::add_reference_object(BoundBox& root, BoundBox& center, Object *ob, int i)
351 {
352         references.push_back(BVHReference(ob->bounds, -1, i, 0));
353         root.grow(ob->bounds);
354         center.grow(ob->bounds.center2());
355 }
356
357 static size_t count_curve_segments(Mesh *mesh)
358 {
359         size_t num = 0, num_curves = mesh->num_curves();
360
361         for(size_t i = 0; i < num_curves; i++)
362                 num += mesh->get_curve(i).num_keys - 1;
363
364         return num;
365 }
366
367 void BVHBuild::add_references(BVHRange& root)
368 {
369         /* reserve space for references */
370         size_t num_alloc_references = 0;
371
372         foreach(Object *ob, objects) {
373                 if(params.top_level) {
374                         if(!ob->is_traceable()) {
375                                 continue;
376                         }
377                         if(!ob->mesh->is_instanced()) {
378                                 if(params.primitive_mask & PRIMITIVE_ALL_TRIANGLE) {
379                                         num_alloc_references += ob->mesh->num_triangles();
380                                 }
381                                 if(params.primitive_mask & PRIMITIVE_ALL_CURVE) {
382                                         num_alloc_references += count_curve_segments(ob->mesh);
383                                 }
384                         }
385                         else
386                                 num_alloc_references++;
387                 }
388                 else {
389                         if(params.primitive_mask & PRIMITIVE_ALL_TRIANGLE) {
390                                 num_alloc_references += ob->mesh->num_triangles();
391                         }
392                         if(params.primitive_mask & PRIMITIVE_ALL_CURVE) {
393                                 num_alloc_references += count_curve_segments(ob->mesh);
394                         }
395                 }
396         }
397
398         references.reserve(num_alloc_references);
399
400         /* add references from objects */
401         BoundBox bounds = BoundBox::empty, center = BoundBox::empty;
402         int i = 0;
403
404         foreach(Object *ob, objects) {
405                 if(params.top_level) {
406                         if(!ob->is_traceable()) {
407                                 ++i;
408                                 continue;
409                         }
410                         if(!ob->mesh->is_instanced())
411                                 add_reference_mesh(bounds, center, ob->mesh, i);
412                         else
413                                 add_reference_object(bounds, center, ob, i);
414                 }
415                 else
416                         add_reference_mesh(bounds, center, ob->mesh, i);
417
418                 i++;
419
420                 if(progress.get_cancel()) return;
421         }
422
423         /* happens mostly on empty meshes */
424         if(!bounds.valid())
425                 bounds.grow(make_float3(0.0f, 0.0f, 0.0f));
426
427         root = BVHRange(bounds, center, 0, references.size());
428 }
429
430 /* Build */
431
432 BVHNode* BVHBuild::run()
433 {
434         BVHRange root;
435
436         /* add references */
437         add_references(root);
438
439         if(progress.get_cancel())
440                 return NULL;
441
442         /* init spatial splits */
443         if(params.top_level) {
444                 /* NOTE: Technically it is supported by the builder but it's not really
445                  * optimized for speed yet and not really clear yet if it has measurable
446                  * improvement on render time. Needs some extra investigation before
447                  * enabling spatial split for top level BVH.
448                  */
449                 params.use_spatial_split = false;
450         }
451
452         spatial_min_overlap = root.bounds().safe_area() * params.spatial_split_alpha;
453         if(params.use_spatial_split) {
454                 /* NOTE: The API here tries to be as much ready for multi-threaded build
455                  * as possible, but at the same time it tries not to introduce any
456                  * changes in behavior for until all refactoring needed for threading is
457                  * finished.
458                  *
459                  * So we currently allocate single storage for now, which is only used by
460                  * the only thread working on the spatial BVH build.
461                  */
462                 spatial_storage.resize(TaskScheduler::num_threads() + 1);
463                 size_t num_bins = max(root.size(), (int)BVHParams::NUM_SPATIAL_BINS) - 1;
464                 foreach(BVHSpatialStorage &storage, spatial_storage) {
465                         storage.right_bounds.clear();
466                 }
467                 spatial_storage[0].right_bounds.resize(num_bins);
468         }
469         spatial_free_index = 0;
470
471         need_prim_time = params.num_motion_curve_steps > 0 ||
472                          params.num_motion_triangle_steps > 0;
473
474         /* init progress updates */
475         double build_start_time;
476         build_start_time = progress_start_time = time_dt();
477         progress_count = 0;
478         progress_total = references.size();
479         progress_original_total = progress_total;
480
481         prim_type.resize(references.size());
482         prim_index.resize(references.size());
483         prim_object.resize(references.size());
484         if(need_prim_time) {
485                 prim_time.resize(references.size());
486         }
487         else {
488                 prim_time.resize(0);
489         }
490
491         /* build recursively */
492         BVHNode *rootnode;
493
494         if(params.use_spatial_split) {
495                 /* Perform multithreaded spatial split build. */
496                 rootnode = build_node(root, &references, 0, 0);
497                 task_pool.wait_work();
498         }
499         else {
500                 /* Perform multithreaded binning build. */
501                 BVHObjectBinning rootbin(root, (references.size())? &references[0]: NULL);
502                 rootnode = build_node(rootbin, 0);
503                 task_pool.wait_work();
504         }
505
506         /* delete if we canceled */
507         if(rootnode) {
508                 if(progress.get_cancel()) {
509                         rootnode->deleteSubtree();
510                         rootnode = NULL;
511                         VLOG(1) << "BVH build cancelled.";
512                 }
513                 else {
514                         /*rotate(rootnode, 4, 5);*/
515                         rootnode->update_visibility();
516                         rootnode->update_time();
517                 }
518                 if(rootnode != NULL) {
519                         VLOG(1) << "BVH build statistics:\n"
520                                 << "  Build time: " << time_dt() - build_start_time << "\n"
521                                 << "  Total number of nodes: "
522                                 << string_human_readable_number(rootnode->getSubtreeSize(BVH_STAT_NODE_COUNT)) << "\n"
523                                 << "  Number of inner nodes: "
524                                 << string_human_readable_number(rootnode->getSubtreeSize(BVH_STAT_INNER_COUNT)) << "\n"
525                                 << "  Number of leaf nodes: "
526                                 << string_human_readable_number(rootnode->getSubtreeSize(BVH_STAT_LEAF_COUNT)) << "\n"
527                                 << "  Number of unaligned nodes: "
528                                 << string_human_readable_number(rootnode->getSubtreeSize(BVH_STAT_UNALIGNED_COUNT))  << "\n"
529                                 << "  Allocation slop factor: "
530                                        << ((prim_type.capacity() != 0)
531                                                ? (float)prim_type.size() / prim_type.capacity()
532                                                : 1.0f) << "\n";
533                 }
534         }
535
536
537         return rootnode;
538 }
539
540 void BVHBuild::progress_update()
541 {
542         if(time_dt() - progress_start_time < 0.25)
543                 return;
544
545         double progress_start = (double)progress_count/(double)progress_total;
546         double duplicates = (double)(progress_total - progress_original_total)/(double)progress_total;
547
548         string msg = string_printf("Building BVH %.0f%%, duplicates %.0f%%",
549                                    progress_start * 100.0, duplicates * 100.0);
550
551         progress.set_substatus(msg);
552         progress_start_time = time_dt();
553 }
554
555 void BVHBuild::thread_build_node(InnerNode *inner,
556                                  int child,
557                                  BVHObjectBinning *range,
558                                  int level)
559 {
560         if(progress.get_cancel())
561                 return;
562
563         /* build nodes */
564         BVHNode *node = build_node(*range, level);
565
566         /* set child in inner node */
567         inner->children[child] = node;
568
569         /* update progress */
570         if(range->size() < THREAD_TASK_SIZE) {
571                 /*rotate(node, INT_MAX, 5);*/
572
573                 thread_scoped_lock lock(build_mutex);
574
575                 progress_count += range->size();
576                 progress_update();
577         }
578 }
579
580 void BVHBuild::thread_build_spatial_split_node(InnerNode *inner,
581                                                int child,
582                                                BVHRange *range,
583                                                vector<BVHReference> *references,
584                                                int level,
585                                                int thread_id)
586 {
587         if(progress.get_cancel()) {
588                 return;
589         }
590
591         /* build nodes */
592         BVHNode *node = build_node(*range, references, level, thread_id);
593
594         /* set child in inner node */
595         inner->children[child] = node;
596 }
597
598 bool BVHBuild::range_within_max_leaf_size(const BVHRange& range,
599                                           const vector<BVHReference>& references) const
600 {
601         size_t size = range.size();
602         size_t max_leaf_size = max(params.max_triangle_leaf_size, params.max_curve_leaf_size);
603
604         if(size > max_leaf_size)
605                 return false;
606
607         size_t num_triangles = 0;
608         size_t num_motion_triangles = 0;
609         size_t num_curves = 0;
610         size_t num_motion_curves = 0;
611
612         for(int i = 0; i < size; i++) {
613                 const BVHReference& ref = references[range.start() + i];
614
615                 if(ref.prim_type() & PRIMITIVE_CURVE)
616                         num_curves++;
617                 if(ref.prim_type() & PRIMITIVE_MOTION_CURVE)
618                         num_motion_curves++;
619                 else if(ref.prim_type() & PRIMITIVE_TRIANGLE)
620                         num_triangles++;
621                 else if(ref.prim_type() & PRIMITIVE_MOTION_TRIANGLE)
622                         num_motion_triangles++;
623         }
624
625         return (num_triangles <= params.max_triangle_leaf_size) &&
626                (num_motion_triangles <= params.max_motion_triangle_leaf_size) &&
627                (num_curves <= params.max_curve_leaf_size) &&
628                (num_motion_curves <= params.max_motion_curve_leaf_size);
629 }
630
631 /* multithreaded binning builder */
632 BVHNode* BVHBuild::build_node(const BVHObjectBinning& range, int level)
633 {
634         size_t size = range.size();
635         float leafSAH = params.sah_primitive_cost * range.leafSAH;
636         float splitSAH = params.sah_node_cost * range.bounds().half_area() + params.sah_primitive_cost * range.splitSAH;
637
638         /* Have at least one inner node on top level, for performance and correct
639          * visibility tests, since object instances do not check visibility flag.
640          */
641         if(!(range.size() > 0 && params.top_level && level == 0)) {
642                 /* Make leaf node when threshold reached or SAH tells us. */
643                 if((params.small_enough_for_leaf(size, level)) ||
644                    (range_within_max_leaf_size(range, references) && leafSAH < splitSAH))
645                 {
646                         return create_leaf_node(range, references);
647                 }
648         }
649
650         BVHObjectBinning unaligned_range;
651         float unalignedSplitSAH = FLT_MAX;
652         float unalignedLeafSAH = FLT_MAX;
653         Transform aligned_space;
654         bool do_unalinged_split = false;
655         if(params.use_unaligned_nodes &&
656            splitSAH > params.unaligned_split_threshold*leafSAH)
657         {
658                 aligned_space = unaligned_heuristic.compute_aligned_space(
659                         range, &references[0]);
660                 unaligned_range = BVHObjectBinning(range,
661                                                    &references[0],
662                                                    &unaligned_heuristic,
663                                                    &aligned_space);
664                 unalignedSplitSAH = params.sah_node_cost * unaligned_range.unaligned_bounds().half_area() +
665                                     params.sah_primitive_cost * unaligned_range.splitSAH;
666                 unalignedLeafSAH = params.sah_primitive_cost * unaligned_range.leafSAH;
667                 if(!(range.size() > 0 && params.top_level && level == 0)) {
668                         if(unalignedLeafSAH < unalignedSplitSAH && unalignedSplitSAH < splitSAH &&
669                            range_within_max_leaf_size(range, references))
670                         {
671                                 return create_leaf_node(range, references);
672                         }
673                 }
674                 /* Check whether unaligned split is better than the regulat one. */
675                 if(unalignedSplitSAH < splitSAH) {
676                         do_unalinged_split = true;
677                 }
678         }
679
680         /* Perform split. */
681         BVHObjectBinning left, right;
682         if(do_unalinged_split) {
683                 unaligned_range.split(&references[0], left, right);
684         }
685         else {
686                 range.split(&references[0], left, right);
687         }
688
689         BoundBox bounds;
690         if(do_unalinged_split) {
691                 bounds = unaligned_heuristic.compute_aligned_boundbox(
692                         range, &references[0], aligned_space);
693         }
694         else {
695                 bounds = range.bounds();
696         }
697
698         /* Create inner node. */
699         InnerNode *inner;
700         if(range.size() < THREAD_TASK_SIZE) {
701                 /* local build */
702                 BVHNode *leftnode = build_node(left, level + 1);
703                 BVHNode *rightnode = build_node(right, level + 1);
704
705                 inner = new InnerNode(bounds, leftnode, rightnode);
706         }
707         else {
708                 /* Threaded build */
709                 inner = new InnerNode(bounds);
710
711                 task_pool.push(new BVHBuildTask(this, inner, 0, left, level + 1), true);
712                 task_pool.push(new BVHBuildTask(this, inner, 1, right, level + 1), true);
713         }
714
715         if(do_unalinged_split) {
716                 inner->set_aligned_space(aligned_space);
717         }
718
719         return inner;
720 }
721
722 /* multithreaded spatial split builder */
723 BVHNode* BVHBuild::build_node(const BVHRange& range,
724                               vector<BVHReference> *references,
725                               int level,
726                               int thread_id)
727 {
728         /* Update progress.
729          *
730          * TODO(sergey): Currently it matches old behavior, but we can move it to the
731          * task thread (which will mimic non=split builder) and save some CPU ticks
732          * on checking cancel status.
733          */
734         progress_update();
735         if(progress.get_cancel()) {
736                 return NULL;
737         }
738
739         /* Small enough or too deep => create leaf. */
740         if(!(range.size() > 0 && params.top_level && level == 0)) {
741                 if(params.small_enough_for_leaf(range.size(), level)) {
742                         progress_count += range.size();
743                         return create_leaf_node(range, *references);
744                 }
745         }
746
747         /* Perform splitting test. */
748         BVHSpatialStorage *storage = &spatial_storage[thread_id];
749         BVHMixedSplit split(this, storage, range, references, level);
750
751         if(!(range.size() > 0 && params.top_level && level == 0)) {
752                 if(split.no_split) {
753                         progress_count += range.size();
754                         return create_leaf_node(range, *references);
755                 }
756         }
757         float leafSAH = params.sah_primitive_cost * split.leafSAH;
758         float splitSAH = params.sah_node_cost * range.bounds().half_area() +
759                          params.sah_primitive_cost * split.nodeSAH;
760
761         BVHMixedSplit unaligned_split;
762         float unalignedSplitSAH = FLT_MAX;
763         /* float unalignedLeafSAH = FLT_MAX; */
764         Transform aligned_space;
765         bool do_unalinged_split = false;
766         if(params.use_unaligned_nodes &&
767            splitSAH > params.unaligned_split_threshold*leafSAH)
768         {
769                 aligned_space =
770                         unaligned_heuristic.compute_aligned_space(range, &references->at(0));
771                 unaligned_split = BVHMixedSplit(this,
772                                                 storage,
773                                                 range,
774                                                 references,
775                                                 level,
776                                                 &unaligned_heuristic,
777                                                 &aligned_space);
778                 /* unalignedLeafSAH = params.sah_primitive_cost * split.leafSAH; */
779                 unalignedSplitSAH = params.sah_node_cost * unaligned_split.bounds.half_area() +
780                                     params.sah_primitive_cost * unaligned_split.nodeSAH;
781                 /* TOOD(sergey): Check we can create leaf already. */
782                 /* Check whether unaligned split is better than the regulat one. */
783                 if(unalignedSplitSAH < splitSAH) {
784                         do_unalinged_split = true;
785                 }
786         }
787
788         /* Do split. */
789         BVHRange left, right;
790         if(do_unalinged_split) {
791                 unaligned_split.split(this, left, right, range);
792         }
793         else {
794                 split.split(this, left, right, range);
795         }
796
797         progress_total += left.size() + right.size() - range.size();
798
799         BoundBox bounds;
800         if(do_unalinged_split) {
801                 bounds = unaligned_heuristic.compute_aligned_boundbox(
802                         range, &references->at(0), aligned_space);
803         }
804         else {
805                 bounds = range.bounds();
806         }
807
808         /* Create inner node. */
809         InnerNode *inner;
810         if(range.size() < THREAD_TASK_SIZE) {
811                 /* Local build. */
812
813                 /* Build left node. */
814                 vector<BVHReference> copy(references->begin() + right.start(),
815                                           references->begin() + right.end());
816                 right.set_start(0);
817
818                 BVHNode *leftnode = build_node(left, references, level + 1, thread_id);
819
820                 /* Build right node. */
821                 BVHNode *rightnode = build_node(right, &copy, level + 1, thread_id);
822
823                 inner = new InnerNode(bounds, leftnode, rightnode);
824         }
825         else {
826                 /* Threaded build. */
827                 inner = new InnerNode(bounds);
828                 task_pool.push(new BVHSpatialSplitBuildTask(this,
829                                                             inner,
830                                                             0,
831                                                             left,
832                                                             *references,
833                                                             level + 1),
834                                true);
835                 task_pool.push(new BVHSpatialSplitBuildTask(this,
836                                                             inner,
837                                                             1,
838                                                             right,
839                                                             *references,
840                                                             level + 1),
841                                true);
842         }
843
844         if(do_unalinged_split) {
845                 inner->set_aligned_space(aligned_space);
846         }
847
848         return inner;
849 }
850
851 /* Create Nodes */
852
853 BVHNode *BVHBuild::create_object_leaf_nodes(const BVHReference *ref, int start, int num)
854 {
855         if(num == 0) {
856                 BoundBox bounds = BoundBox::empty;
857                 return new LeafNode(bounds, 0, 0, 0);
858         }
859         else if(num == 1) {
860                 assert(start < prim_type.size());
861                 prim_type[start] = ref->prim_type();
862                 prim_index[start] = ref->prim_index();
863                 prim_object[start] = ref->prim_object();
864                 if(need_prim_time) {
865                         prim_time[start] = make_float2(ref->time_from(), ref->time_to());
866                 }
867
868                 const uint visibility = objects[ref->prim_object()]->visibility_for_tracing();
869                 BVHNode *leaf_node =  new LeafNode(ref->bounds(), visibility, start, start+1);
870                 leaf_node->time_from = ref->time_from();
871                 leaf_node->time_to = ref->time_to();
872                 return leaf_node;
873         }
874         else {
875                 int mid = num/2;
876                 BVHNode *leaf0 = create_object_leaf_nodes(ref, start, mid);
877                 BVHNode *leaf1 = create_object_leaf_nodes(ref+mid, start+mid, num-mid);
878
879                 BoundBox bounds = BoundBox::empty;
880                 bounds.grow(leaf0->bounds);
881                 bounds.grow(leaf1->bounds);
882
883                 BVHNode *inner_node = new InnerNode(bounds, leaf0, leaf1);
884                 inner_node->time_from = min(leaf0->time_from, leaf1->time_from);
885                 inner_node->time_to = max(leaf0->time_to, leaf1->time_to);
886                 return inner_node;
887         }
888 }
889
890 BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
891                                     const vector<BVHReference>& references)
892 {
893         /* This is a bit overallocating here (considering leaf size into account),
894          * but chunk-based re-allocation in vector makes it difficult to use small
895          * size of stack storage here. Some tweaks are possible tho.
896          *
897          * NOTES:
898          *  - If the size is too big, we'll have inefficient stack usage,
899          *    and lots of cache misses.
900          *  - If the size is too small, then we can run out of memory
901          *    allowed to be used by vector.
902          *    In practice it wouldn't mean crash, just allocator will fallback
903          *    to heap which is slower.
904          *  - Optimistic re-allocation in STL could jump us out of stack usage
905          *    because re-allocation happens in chunks and size of those chunks we
906          *    can not control.
907          */
908         typedef StackAllocator<256, int> LeafStackAllocator;
909         typedef StackAllocator<256, float2> LeafTimeStackAllocator;
910         typedef StackAllocator<256, BVHReference> LeafReferenceStackAllocator;
911
912         vector<int, LeafStackAllocator> p_type[PRIMITIVE_NUM_TOTAL];
913         vector<int, LeafStackAllocator> p_index[PRIMITIVE_NUM_TOTAL];
914         vector<int, LeafStackAllocator> p_object[PRIMITIVE_NUM_TOTAL];
915         vector<float2, LeafTimeStackAllocator> p_time[PRIMITIVE_NUM_TOTAL];
916         vector<BVHReference, LeafReferenceStackAllocator> p_ref[PRIMITIVE_NUM_TOTAL];
917
918         /* TODO(sergey): In theory we should be able to store references. */
919         vector<BVHReference, LeafReferenceStackAllocator> object_references;
920
921         uint visibility[PRIMITIVE_NUM_TOTAL] = {0};
922         /* NOTE: Keep initializtion in sync with actual number of primitives. */
923         BoundBox bounds[PRIMITIVE_NUM_TOTAL] = {BoundBox::empty,
924                                                 BoundBox::empty,
925                                                 BoundBox::empty,
926                                                 BoundBox::empty};
927         int ob_num = 0;
928         int num_new_prims = 0;
929         /* Fill in per-type type/index array. */
930         for(int i = 0; i < range.size(); i++) {
931                 const BVHReference& ref = references[range.start() + i];
932                 if(ref.prim_index() != -1) {
933                         int type_index = bitscan(ref.prim_type() & PRIMITIVE_ALL);
934                         p_ref[type_index].push_back(ref);
935                         p_type[type_index].push_back(ref.prim_type());
936                         p_index[type_index].push_back(ref.prim_index());
937                         p_object[type_index].push_back(ref.prim_object());
938                         p_time[type_index].push_back(make_float2(ref.time_from(),
939                                                                  ref.time_to()));
940
941                         bounds[type_index].grow(ref.bounds());
942                         visibility[type_index] |= objects[ref.prim_object()]->visibility_for_tracing();
943                         if(ref.prim_type() & PRIMITIVE_ALL_CURVE) {
944                                 visibility[type_index] |= PATH_RAY_CURVE;
945                         }
946                         ++num_new_prims;
947                 }
948                 else {
949                         object_references.push_back(ref);
950                         ++ob_num;
951                 }
952         }
953
954         /* Create leaf nodes for every existing primitive.
955          *
956          * Here we write primitive types, indices and objects to a temporary array.
957          * This way we keep all the heavy memory allocation code outside of the
958          * thread lock in the case of spatial split building.
959          *
960          * TODO(sergey): With some pointer trickery we can write directly to the
961          * destination buffers for the non-spatial split BVH.
962          */
963         BVHNode *leaves[PRIMITIVE_NUM_TOTAL + 1] = {NULL};
964         int num_leaves = 0;
965         size_t start_index = 0;
966         vector<int, LeafStackAllocator> local_prim_type,
967                                         local_prim_index,
968                                         local_prim_object;
969         vector<float2, LeafTimeStackAllocator> local_prim_time;
970         local_prim_type.resize(num_new_prims);
971         local_prim_index.resize(num_new_prims);
972         local_prim_object.resize(num_new_prims);
973         if(need_prim_time) {
974                 local_prim_time.resize(num_new_prims);
975         }
976         for(int i = 0; i < PRIMITIVE_NUM_TOTAL; ++i) {
977                 int num = (int)p_type[i].size();
978                 if(num != 0) {
979                         assert(p_type[i].size() == p_index[i].size());
980                         assert(p_type[i].size() == p_object[i].size());
981                         Transform aligned_space;
982                         bool alignment_found = false;
983                         for(int j = 0; j < num; ++j) {
984                                 const int index = start_index + j;
985                                 local_prim_type[index] = p_type[i][j];
986                                 local_prim_index[index] = p_index[i][j];
987                                 local_prim_object[index] = p_object[i][j];
988                                 if(need_prim_time) {
989                                         local_prim_time[index] = p_time[i][j];
990                                 }
991                                 if(params.use_unaligned_nodes && !alignment_found) {
992                                         alignment_found =
993                                                 unaligned_heuristic.compute_aligned_space(p_ref[i][j],
994                                                                                           &aligned_space);
995                                 }
996                         }
997                         LeafNode *leaf_node = new LeafNode(bounds[i],
998                                                            visibility[i],
999                                                            start_index,
1000                                                            start_index + num);
1001                         if(true) {
1002                                 float time_from = 1.0f, time_to = 0.0f;
1003                                 for(int j = 0; j < num; ++j) {
1004                                         const BVHReference &ref = p_ref[i][j];
1005                                         time_from = min(time_from, ref.time_from());
1006                                         time_to = max(time_to, ref.time_to());
1007                                 }
1008                                 leaf_node->time_from = time_from;
1009                                 leaf_node->time_to = time_to;
1010                         }
1011                         if(alignment_found) {
1012                                 /* Need to recalculate leaf bounds with new alignment. */
1013                                 leaf_node->bounds = BoundBox::empty;
1014                                 for(int j = 0; j < num; ++j) {
1015                                         const BVHReference &ref = p_ref[i][j];
1016                                         BoundBox ref_bounds =
1017                                                 unaligned_heuristic.compute_aligned_prim_boundbox(
1018                                                         ref,
1019                                                         aligned_space);
1020                                         leaf_node->bounds.grow(ref_bounds);
1021                                 }
1022                                 /* Set alignment space. */
1023                                 leaf_node->set_aligned_space(aligned_space);
1024                         }
1025                         leaves[num_leaves++] = leaf_node;
1026                         start_index += num;
1027                 }
1028         }
1029         /* Get size of new data to be copied to the packed arrays. */
1030         const int num_new_leaf_data = start_index;
1031         const size_t new_leaf_data_size = sizeof(int) * num_new_leaf_data;
1032         /* Copy actual data to the packed array. */
1033         if(params.use_spatial_split) {
1034                 spatial_spin_lock.lock();
1035                 /* We use first free index in the packed arrays and mode pointer to the
1036                  * end of the current range.
1037                  *
1038                  * This doesn't give deterministic packed arrays, but it shouldn't really
1039                  * matter because order of children in BVH is deterministic.
1040                  */
1041                 start_index = spatial_free_index;
1042                 spatial_free_index += range.size();
1043                 /* Extend an array when needed. */
1044                 const size_t range_end = start_index + range.size();
1045                 if(prim_type.size() < range_end) {
1046                         /* Avoid extra re-allocations by pre-allocating bigger array in an
1047                          * advance.
1048                          */
1049                         if(range_end >= prim_type.capacity()) {
1050                                 float progress = (float)progress_count/(float)progress_total;
1051                                 float factor = (1.0f - progress);
1052                                 const size_t reserve = (size_t)(range_end + (float)range_end*factor);
1053                                 prim_type.reserve(reserve);
1054                                 prim_index.reserve(reserve);
1055                                 prim_object.reserve(reserve);
1056                                 if(need_prim_time) {
1057                                         prim_time.reserve(reserve);
1058                                 }
1059                         }
1060
1061                         prim_type.resize(range_end);
1062                         prim_index.resize(range_end);
1063                         prim_object.resize(range_end);
1064                         if(need_prim_time) {
1065                                 prim_time.resize(range_end);
1066                         }
1067                 }
1068                 /* Perform actual data copy. */
1069                 if(new_leaf_data_size > 0) {
1070                         memcpy(&prim_type[start_index], &local_prim_type[0], new_leaf_data_size);
1071                         memcpy(&prim_index[start_index], &local_prim_index[0], new_leaf_data_size);
1072                         memcpy(&prim_object[start_index], &local_prim_object[0], new_leaf_data_size);
1073                         if(need_prim_time) {
1074                                 memcpy(&prim_time[start_index], &local_prim_time[0], sizeof(float2)*num_new_leaf_data);
1075                         }
1076                 }
1077                 spatial_spin_lock.unlock();
1078         }
1079         else {
1080                 /* For the regular BVH builder we simply copy new data starting at the
1081                  * range start. This is totally thread-safe, all threads are living
1082                  * inside of their own range.
1083                  */
1084                 start_index = range.start();
1085                 if(new_leaf_data_size > 0) {
1086                         memcpy(&prim_type[start_index], &local_prim_type[0], new_leaf_data_size);
1087                         memcpy(&prim_index[start_index], &local_prim_index[0], new_leaf_data_size);
1088                         memcpy(&prim_object[start_index], &local_prim_object[0], new_leaf_data_size);
1089                         if(need_prim_time) {
1090                                 memcpy(&prim_time[start_index], &local_prim_time[0], sizeof(float2)*num_new_leaf_data);
1091                         }
1092                 }
1093         }
1094
1095         /* So far leaves were created with the zero-based index in an arrays,
1096          * here we modify the indices to correspond to actual packed array start
1097          * index.
1098          */
1099         for(int i = 0; i < num_leaves; ++i) {
1100                 LeafNode *leaf = (LeafNode *)leaves[i];
1101                 leaf->lo += start_index;
1102                 leaf->hi += start_index;
1103         }
1104
1105         /* Create leaf node for object. */
1106         if(num_leaves == 0 || ob_num) {
1107                 /* Only create object leaf nodes if there are objects or no other
1108                  * nodes created.
1109                  */
1110                 const BVHReference *ref = (ob_num)? &object_references[0]: NULL;
1111                 leaves[num_leaves] = create_object_leaf_nodes(ref,
1112                                                               start_index + num_new_leaf_data,
1113                                                               ob_num);
1114                 ++num_leaves;
1115         }
1116
1117         /* TODO(sergey): Need to take care of alignment when number of leaves
1118          * is more than 1.
1119          */
1120         if(num_leaves == 1) {
1121                 /* Simplest case: single leaf, just return it.
1122                  * In all the rest cases we'll be creating intermediate inner node with
1123                  * an appropriate bounding box.
1124                  */
1125                 return leaves[0];
1126         }
1127         else if(num_leaves == 2) {
1128                 return new InnerNode(range.bounds(), leaves[0], leaves[1]);
1129         }
1130         else if(num_leaves == 3) {
1131                 BoundBox inner_bounds = merge(leaves[1]->bounds, leaves[2]->bounds);
1132                 BVHNode *inner = new InnerNode(inner_bounds, leaves[1], leaves[2]);
1133                 return new InnerNode(range.bounds(), leaves[0], inner);
1134         } else {
1135                 /* Should be doing more branches if more primitive types added. */
1136                 assert(num_leaves <= 5);
1137                 BoundBox inner_bounds_a = merge(leaves[0]->bounds, leaves[1]->bounds);
1138                 BoundBox inner_bounds_b = merge(leaves[2]->bounds, leaves[3]->bounds);
1139                 BVHNode *inner_a = new InnerNode(inner_bounds_a, leaves[0], leaves[1]);
1140                 BVHNode *inner_b = new InnerNode(inner_bounds_b, leaves[2], leaves[3]);
1141                 BoundBox inner_bounds_c = merge(inner_a->bounds, inner_b->bounds);
1142                 BVHNode *inner_c = new InnerNode(inner_bounds_c, inner_a, inner_b);
1143                 if(num_leaves == 5) {
1144                         return new InnerNode(range.bounds(), inner_c, leaves[4]);
1145                 }
1146                 return inner_c;
1147         }
1148
1149 #undef MAX_ITEMS_PER_LEAF
1150 }
1151
1152 /* Tree Rotations */
1153
1154 void BVHBuild::rotate(BVHNode *node, int max_depth, int iterations)
1155 {
1156         /* in tested scenes, this resulted in slightly slower raytracing, so disabled
1157          * it for now. could be implementation bug, or depend on the scene */
1158         if(node)
1159                 for(int i = 0; i < iterations; i++)
1160                         rotate(node, max_depth);
1161 }
1162
1163 void BVHBuild::rotate(BVHNode *node, int max_depth)
1164 {
1165         /* nothing to rotate if we reached a leaf node. */
1166         if(node->is_leaf() || max_depth < 0)
1167                 return;
1168
1169         InnerNode *parent = (InnerNode*)node;
1170
1171         /* rotate all children first */
1172         for(size_t c = 0; c < 2; c++)
1173                 rotate(parent->children[c], max_depth-1);
1174
1175         /* compute current area of all children */
1176         BoundBox bounds0 = parent->children[0]->bounds;
1177         BoundBox bounds1 = parent->children[1]->bounds;
1178
1179         float area0 = bounds0.half_area();
1180         float area1 = bounds1.half_area();
1181         float4 child_area = make_float4(area0, area1, 0.0f, 0.0f);
1182
1183         /* find best rotation. we pick a target child of a first child, and swap
1184          * this with an other child. we perform the best such swap. */
1185         float best_cost = FLT_MAX;
1186         int best_child = -1, best_target = -1, best_other = -1;
1187
1188         for(size_t c = 0; c < 2; c++) {
1189                 /* ignore leaf nodes as we cannot descent into */
1190                 if(parent->children[c]->is_leaf())
1191                         continue;
1192
1193                 InnerNode *child = (InnerNode*)parent->children[c];
1194                 BoundBox& other = (c == 0)? bounds1: bounds0;
1195
1196                 /* transpose child bounds */
1197                 BoundBox target0 = child->children[0]->bounds;
1198                 BoundBox target1 = child->children[1]->bounds;
1199
1200                 /* compute cost for both possible swaps */
1201                 float cost0 = merge(other, target1).half_area() - child_area[c];
1202                 float cost1 = merge(target0, other).half_area() - child_area[c];
1203
1204                 if(min(cost0,cost1) < best_cost) {
1205                         best_child = (int)c;
1206                         best_other = (int)(1-c);
1207
1208                         if(cost0 < cost1) {
1209                                 best_cost = cost0;
1210                                 best_target = 0;
1211                         }
1212                         else {
1213                                 best_cost = cost0;
1214                                 best_target = 1;
1215                         }
1216                 }
1217         }
1218
1219         /* if we did not find a swap that improves the SAH then do nothing */
1220         if(best_cost >= 0)
1221                 return;
1222
1223         assert(best_child == 0 || best_child == 1);
1224         assert(best_target != -1);
1225
1226         /* perform the best found tree rotation */
1227         InnerNode *child = (InnerNode*)parent->children[best_child];
1228
1229         swap(parent->children[best_other], child->children[best_target]);
1230         child->bounds = merge(child->children[0]->bounds, child->children[1]->bounds);
1231 }
1232
1233 CCL_NAMESPACE_END