Approximate AO: Diffuse Bounce Hack
[blender.git] / source / blender / render / intern / source / occlusion.c
1 /* 
2  * $Id$
3  *
4  * ***** BEGIN GPL LICENSE BLOCK *****
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software Foundation,
18  * Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
19  *
20  * The Original Code is Copyright (C) 2008 Blender Foundation.
21  * All rights reserved.
22  *
23  * The Original Code is: all of this file.
24  *
25  * Contributor(s): Brecht Van Lommel.
26  *
27  * ***** END GPL LICENSE BLOCK *****
28  */
29
30 #include <math.h>
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34
35 #include "MEM_guardedalloc.h"
36
37 #include "DNA_material_types.h"
38
39 #include "BLI_math.h"
40 #include "BLI_blenlib.h"
41 #include "BLI_memarena.h"
42 #include "BLI_threads.h"
43
44 #include "BKE_global.h"
45 #include "BKE_scene.h"
46 #include "BKE_utildefines.h"
47
48 #include "RE_shader_ext.h"
49
50 /* local includes */
51 #include "occlusion.h"
52 #include "render_types.h"
53 #include "rendercore.h"
54 #include "renderdatabase.h"
55 #include "pixelshading.h"
56 #include "shading.h"
57 #include "zbuf.h"
58
59 /* ------------------------- Declarations --------------------------- */
60
61 #define INVALID_INDEX ((int)(~0))
62 #define INVPI 0.31830988618379069f
63 #define TOTCHILD 8
64 #define CACHE_STEP 3
65
66 typedef struct OcclusionCacheSample {
67         float co[3], n[3], ao[3], indirect[3], intensity, dist2;
68         int x, y, filled;
69 } OcclusionCacheSample;
70
71 typedef struct OcclusionCache {
72         OcclusionCacheSample *sample;
73         int x, y, w, h, step;
74 } OcclusionCache;
75
76 typedef struct OccFace {
77         int obi;
78         int facenr;
79 } OccFace;
80
81 typedef struct OccNode {
82         float co[3], area;
83         float sh[9], dco;
84         float occlusion, rad[3];
85         int childflag;
86         union {
87                 //OccFace face;
88                 int face;
89                 struct OccNode *node;
90         } child[TOTCHILD];
91 } OccNode;
92
93 typedef struct OcclusionTree {
94         MemArena *arena;
95
96         float (*co)[3];         /* temporary during build */
97
98         OccFace *face;          /* instance and face indices */
99         float *occlusion;       /* occlusion for faces */
100         float (*rad)[3];        /* radiance for faces */
101         
102         OccNode *root;
103
104         OccNode **stack[BLENDER_MAX_THREADS];
105         int maxdepth;
106
107         int totface;
108
109         float error;
110         float distfac;
111
112         int dothreadedbuild;
113         int totbuildthread;
114
115         OcclusionCache *cache;
116 } OcclusionTree;
117
118 typedef struct OcclusionThread {
119         Render *re;
120         StrandSurface *mesh;
121         float (*faceao)[3];
122         float (*faceindirect)[3];
123         int begin, end;
124         int thread;
125 } OcclusionThread;
126
127 typedef struct OcclusionBuildThread {
128         OcclusionTree *tree;
129         int begin, end, depth;
130         OccNode *node;
131 } OcclusionBuildThread;
132
133 /* ------------------------- Shading --------------------------- */
134
135 extern Render R; // meh
136
137 static void occ_shade(ShadeSample *ssamp, ObjectInstanceRen *obi, VlakRen *vlr, float *rad)
138 {
139         ShadeInput *shi= ssamp->shi;
140         ShadeResult *shr= ssamp->shr;
141         float l, u, v, *v1, *v2, *v3;
142         
143         /* init */
144         if(vlr->v4) {
145                 shi->u= u= 0.5f;
146                 shi->v= v= 0.5f;
147         }
148         else {
149                 shi->u= u= 1.0f/3.0f;
150                 shi->v= v= 1.0f/3.0f;
151         }
152
153         /* setup render coordinates */
154         v1= vlr->v1->co;
155         v2= vlr->v2->co;
156         v3= vlr->v3->co;
157         
158         /* renderco */
159         l= 1.0f-u-v;
160         
161         shi->co[0]= l*v3[0]+u*v1[0]+v*v2[0];
162         shi->co[1]= l*v3[1]+u*v1[1]+v*v2[1];
163         shi->co[2]= l*v3[2]+u*v1[2]+v*v2[2];
164
165         shade_input_set_triangle_i(shi, obi, vlr, 0, 1, 2);
166
167         /* set up view vector */
168         VECCOPY(shi->view, shi->co);
169         normalize_v3(shi->view);
170         
171         /* cache for shadow */
172         shi->samplenr++;
173         
174         shi->xs= 0; // TODO
175         shi->ys= 0;
176         
177         shade_input_set_normals(shi);
178
179         /* no normal flip */
180         if(shi->flippednor)
181                 shade_input_flip_normals(shi);
182
183         madd_v3_v3fl(shi->co, shi->vn, 0.0001f); /* ugly.. */
184
185         /* not a pretty solution, but fixes common cases */
186         if(shi->obr->ob && shi->obr->ob->transflag & OB_NEG_SCALE) {
187                 negate_v3(shi->vn);
188                 negate_v3(shi->vno);
189         }
190
191         /* init material vars */
192         // note, keep this synced with render_types.h
193         memcpy(&shi->r, &shi->mat->r, 23*sizeof(float));
194         shi->har= shi->mat->har;
195         
196         /* render */
197         shade_input_set_shade_texco(shi);
198         shade_material_loop(shi, shr); /* todo: nodes */
199         
200         VECCOPY(rad, shr->combined);
201 }
202
203 static void occ_build_shade(Render *re, OcclusionTree *tree)
204 {
205         ShadeSample ssamp;
206         ObjectInstanceRen *obi;
207         VlakRen *vlr;
208         int a;
209
210         R= *re;
211
212         /* setup shade sample with correct passes */
213         memset(&ssamp, 0, sizeof(ShadeSample));
214         ssamp.shi[0].lay= re->scene->lay;
215         ssamp.shi[0].passflag= SCE_PASS_DIFFUSE|SCE_PASS_RGBA;
216         ssamp.shi[0].combinedflag= ~(SCE_PASS_SPEC);
217         ssamp.tot= 1;
218
219         for(a=0; a<tree->totface; a++) {
220                 obi= &R.objectinstance[tree->face[a].obi];
221                 vlr= RE_findOrAddVlak(obi->obr, tree->face[a].facenr);
222
223                 occ_shade(&ssamp, obi, vlr, tree->rad[a]);
224         }
225 }
226
227 /* ------------------------- Spherical Harmonics --------------------------- */
228
229 /* Use 2nd order SH => 9 coefficients, stored in this order:
230    0 = (0,0),
231    1 = (1,-1), 2 = (1,0), 3 = (1,1),
232    4 = (2,-2), 5 = (2,-1), 6 = (2,0), 7 = (2,1), 8 = (2,2) */
233
234 static void sh_copy(float *shresult, float *sh)
235 {
236         memcpy(shresult, sh, sizeof(float)*9);
237 }
238
239 static void sh_mul(float *sh, float f)
240 {
241         int i;
242
243         for(i=0; i<9; i++)
244                 sh[i] *= f;
245 }
246
247 static void sh_add(float *shresult, float *sh1, float *sh2)
248 {
249         int i;
250
251         for(i=0; i<9; i++)
252                 shresult[i]= sh1[i] + sh2[i];
253 }
254
255 static void sh_from_disc(float *n, float area, float *shresult)
256 {
257         /* See formula (3) in:
258            "An Efficient Representation for Irradiance Environment Maps" */
259         float sh[9], x, y, z;
260
261         x= n[0];
262         y= n[1];
263         z= n[2];
264
265         sh[0]= 0.282095f;
266
267         sh[1]= 0.488603f*y;
268         sh[2]= 0.488603f*z;
269         sh[3]= 0.488603f*x;
270         
271         sh[4]= 1.092548f*x*y;
272         sh[5]= 1.092548f*y*z;
273         sh[6]= 0.315392f*(3.0f*z*z - 1.0f);
274         sh[7]= 1.092548f*x*z;
275         sh[8]= 0.546274f*(x*x - y*y);
276
277         sh_mul(sh, area);
278         sh_copy(shresult, sh);
279 }
280
281 static float sh_eval(float *sh, float *v)
282 {
283         /* See formula (13) in:
284            "An Efficient Representation for Irradiance Environment Maps" */
285         static const float c1 = 0.429043f, c2 = 0.511664f, c3 = 0.743125f;
286         static const float c4 = 0.886227f, c5 = 0.247708f;
287         float x, y, z, sum;
288
289         x= v[0];
290         y= v[1];
291         z= v[2];
292
293         sum= c1*sh[8]*(x*x - y*y);
294         sum += c3*sh[6]*z*z;
295         sum += c4*sh[0];
296         sum += -c5*sh[6];
297         sum += 2.0f*c1*(sh[4]*x*y + sh[7]*x*z + sh[5]*y*z);
298         sum += 2.0f*c2*(sh[3]*x + sh[1]*y + sh[2]*z);
299
300         return sum;
301 }
302
303 /* ------------------------------ Building --------------------------------- */
304
305 static void occ_face(const OccFace *face, float *co, float *normal, float *area)
306 {
307         ObjectInstanceRen *obi;
308         VlakRen *vlr;
309         float v1[3], v2[3], v3[3], v4[3];
310
311         obi= &R.objectinstance[face->obi];
312         vlr= RE_findOrAddVlak(obi->obr, face->facenr);
313         
314         if(co) {
315                 if(vlr->v4)
316                         interp_v3_v3v3(co, vlr->v1->co, vlr->v3->co, 0.5f);
317                 else
318                         cent_tri_v3(co, vlr->v1->co, vlr->v2->co, vlr->v3->co);
319
320                 if(obi->flag & R_TRANSFORMED)
321                         mul_m4_v3(obi->mat, co);
322         }
323         
324         if(normal) {
325                 normal[0]= -vlr->n[0];
326                 normal[1]= -vlr->n[1];
327                 normal[2]= -vlr->n[2];
328
329                 if(obi->flag & R_TRANSFORMED)
330                         mul_m3_v3(obi->nmat, normal);
331         }
332
333         if(area) {
334                 VECCOPY(v1, vlr->v1->co);
335                 VECCOPY(v2, vlr->v2->co);
336                 VECCOPY(v3, vlr->v3->co);
337                 if(vlr->v4) VECCOPY(v4, vlr->v4->co);
338
339                 if(obi->flag & R_TRANSFORMED) {
340                         mul_m4_v3(obi->mat, v1);
341                         mul_m4_v3(obi->mat, v2);
342                         mul_m4_v3(obi->mat, v3);
343                         if(vlr->v4) mul_m4_v3(obi->mat, v4);
344                 }
345
346                 /* todo: correct area for instances */
347                 if(vlr->v4)
348                         *area= area_quad_v3(v1, v2, v3, v4);
349                 else
350                         *area= area_tri_v3(v1, v2, v3);
351         }
352 }
353
354 static void occ_sum_occlusion(OcclusionTree *tree, OccNode *node)
355 {
356         OccNode *child;
357         float occ, area, totarea, rad[3];
358         int a, b;
359
360         occ= 0.0f;
361         totarea= 0.0f;
362         zero_v3(rad);
363
364         for(b=0; b<TOTCHILD; b++) {
365                 if(node->childflag & (1<<b)) {
366                         a= node->child[b].face;
367                         occ_face(&tree->face[a], 0, 0, &area);
368                         occ += area*tree->occlusion[a];
369                         madd_v3_v3fl(rad, tree->rad[a], area);
370                         totarea += area;
371                 }
372                 else if(node->child[b].node) {
373                         child= node->child[b].node;
374                         occ_sum_occlusion(tree, child);
375
376                         occ += child->area*child->occlusion;
377                         madd_v3_v3fl(rad, child->rad, child->area);
378                         totarea += child->area;
379                 }
380         }
381
382         if(totarea != 0.0f) {
383                 occ /= totarea;
384                 mul_v3_fl(rad, 1.0f/totarea);
385         }
386         
387         node->occlusion= occ;
388         copy_v3_v3(node->rad, rad);
389 }
390
391 static int occ_find_bbox_axis(OcclusionTree *tree, int begin, int end, float *min, float *max)
392 {
393         float len, maxlen= -1.0f;
394         int a, axis = 0;
395
396         INIT_MINMAX(min, max);
397
398         for(a=begin; a<end; a++)
399                 DO_MINMAX(tree->co[a], min, max)
400
401         for(a=0; a<3; a++) {
402                 len= max[a] - min[a];
403
404                 if(len > maxlen) {
405                         maxlen= len;
406                         axis= a;
407                 }
408         }
409
410         return axis;
411 }
412
413 static void occ_node_from_face(OccFace *face, OccNode *node)
414 {
415         float n[3];
416
417         occ_face(face, node->co, n, &node->area);
418         node->dco= 0.0f;
419         sh_from_disc(n, node->area, node->sh);
420 }
421
422 static void occ_build_dco(OcclusionTree *tree, OccNode *node, float *co, float *dco)
423 {
424         OccNode *child;
425         float dist, d[3], nco[3];
426         int b;
427
428         for(b=0; b<TOTCHILD; b++) {
429                 if(node->childflag & (1<<b)) {
430                         occ_face(tree->face+node->child[b].face, nco, 0, 0);
431                 }
432                 else if(node->child[b].node) {
433                         child= node->child[b].node;
434                         occ_build_dco(tree, child, co, dco);
435                         VECCOPY(nco, child->co);
436                 }
437
438                 VECSUB(d, nco, co);
439                 dist= INPR(d, d);
440                 if(dist > *dco)
441                         *dco= dist;
442         }
443 }
444
445 static void occ_build_split(OcclusionTree *tree, int begin, int end, int *split)
446 {
447         float min[3], max[3], mid;
448         int axis, a, enda;
449
450         /* split in middle of boundbox. this seems faster than median split
451          * on complex scenes, possibly since it avoids two distant faces to
452          * be in the same node better? */
453         axis= occ_find_bbox_axis(tree, begin, end, min, max);
454         mid= 0.5f*(min[axis]+max[axis]);
455
456         a= begin;
457         enda= end;
458         while(a<enda) {
459                 if(tree->co[a][axis] > mid) {
460                         enda--;
461                         SWAP(OccFace, tree->face[a], tree->face[enda]);
462                         SWAP(float, tree->co[a][0], tree->co[enda][0]);
463                         SWAP(float, tree->co[a][1], tree->co[enda][1]);
464                         SWAP(float, tree->co[a][2], tree->co[enda][2]);
465                 }
466                 else
467                         a++;
468         }
469
470         *split= enda;
471 }
472
473 static void occ_build_8_split(OcclusionTree *tree, int begin, int end, int *offset, int *count)
474 {
475         /* split faces into eight groups */
476         int b, splitx, splity[2], splitz[4];
477
478         occ_build_split(tree, begin, end, &splitx);
479
480         /* force split if none found, to deal with degenerate geometry */
481         if(splitx == begin || splitx == end)
482                 splitx= (begin+end)/2;
483
484         occ_build_split(tree, begin, splitx, &splity[0]);
485         occ_build_split(tree, splitx, end, &splity[1]);
486
487         occ_build_split(tree, begin, splity[0], &splitz[0]);
488         occ_build_split(tree, splity[0], splitx, &splitz[1]);
489         occ_build_split(tree, splitx, splity[1], &splitz[2]);
490         occ_build_split(tree, splity[1], end, &splitz[3]);
491
492         offset[0]= begin;
493         offset[1]= splitz[0];
494         offset[2]= splity[0];
495         offset[3]= splitz[1];
496         offset[4]= splitx;
497         offset[5]= splitz[2];
498         offset[6]= splity[1];
499         offset[7]= splitz[3];
500
501         for(b=0; b<7; b++)
502                 count[b]= offset[b+1] - offset[b];
503         count[7]= end - offset[7];
504 }
505
506 static void occ_build_recursive(OcclusionTree *tree, OccNode *node, int begin, int end, int depth);
507
508 static void *exec_occ_build(void *data)
509 {
510         OcclusionBuildThread *othread= (OcclusionBuildThread*)data;
511
512         occ_build_recursive(othread->tree, othread->node, othread->begin, othread->end, othread->depth);
513
514         return 0;
515 }
516
517 static void occ_build_recursive(OcclusionTree *tree, OccNode *node, int begin, int end, int depth)
518 {
519         ListBase threads;
520         OcclusionBuildThread othreads[BLENDER_MAX_THREADS];
521         OccNode *child, tmpnode;
522         OccFace *face;
523         int a, b, totthread=0, offset[TOTCHILD], count[TOTCHILD];
524
525         /* add a new node */
526         node->occlusion= 1.0f;
527
528         /* leaf node with only children */
529         if(end - begin <= TOTCHILD) {
530                 for(a=begin, b=0; a<end; a++, b++) {
531                         face= &tree->face[a];
532                         node->child[b].face= a;
533                         node->childflag |= (1<<b);
534                 }
535         }
536         else {
537                 /* order faces */
538                 occ_build_8_split(tree, begin, end, offset, count);
539
540                 if(depth == 1 && tree->dothreadedbuild)
541                         BLI_init_threads(&threads, exec_occ_build, tree->totbuildthread);
542
543                 for(b=0; b<TOTCHILD; b++) {
544                         if(count[b] == 0) {
545                                 node->child[b].node= NULL;
546                         }
547                         else if(count[b] == 1) {
548                                 face= &tree->face[offset[b]];
549                                 node->child[b].face= offset[b];
550                                 node->childflag |= (1<<b);
551                         }
552                         else {
553                                 if(tree->dothreadedbuild)
554                                         BLI_lock_thread(LOCK_CUSTOM1);
555
556                                 child= BLI_memarena_alloc(tree->arena, sizeof(OccNode));
557                                 node->child[b].node= child;
558
559                                 /* keep track of maximum depth for stack */
560                                 if(depth+1 > tree->maxdepth)
561                                         tree->maxdepth= depth+1;
562
563                                 if(tree->dothreadedbuild)
564                                         BLI_unlock_thread(LOCK_CUSTOM1);
565
566                                 if(depth == 1 && tree->dothreadedbuild) {
567                                         othreads[totthread].tree= tree;
568                                         othreads[totthread].node= child;
569                                         othreads[totthread].begin= offset[b];
570                                         othreads[totthread].end= offset[b]+count[b];
571                                         othreads[totthread].depth= depth+1;
572                                         BLI_insert_thread(&threads, &othreads[totthread]);
573                                         totthread++;
574                                 }
575                                 else
576                                         occ_build_recursive(tree, child, offset[b], offset[b]+count[b], depth+1);
577                         }
578                 }
579
580                 if(depth == 1 && tree->dothreadedbuild)
581                         BLI_end_threads(&threads);
582         }
583
584         /* combine area, position and sh */
585         for(b=0; b<TOTCHILD; b++) {
586                 if(node->childflag & (1<<b)) {
587                         child= &tmpnode;
588                         occ_node_from_face(tree->face+node->child[b].face, &tmpnode);
589                 }
590                 else {
591                         child= node->child[b].node;
592                 }
593
594                 if(child) {
595                         node->area += child->area;
596                         sh_add(node->sh, node->sh, child->sh);
597                         VECADDFAC(node->co, node->co, child->co, child->area);
598                 }
599         }
600
601         if(node->area != 0.0f)
602                 mul_v3_fl(node->co, 1.0f/node->area);
603
604         /* compute maximum distance from center */
605         node->dco= 0.0f;
606         occ_build_dco(tree, node, node->co, &node->dco);
607 }
608
609 static void occ_build_sh_normalize(OccNode *node)
610 {
611         /* normalize spherical harmonics to not include area, so
612          * we can clamp the dot product and then mutliply by area */
613         int b;
614
615         if(node->area != 0.0f)
616                 sh_mul(node->sh, 1.0f/node->area);
617
618         for(b=0; b<TOTCHILD; b++) {
619                 if(node->childflag & (1<<b));
620                 else if(node->child[b].node)
621                         occ_build_sh_normalize(node->child[b].node);
622         }
623 }
624
625 static OcclusionTree *occ_tree_build(Render *re)
626 {
627         OcclusionTree *tree;
628         ObjectInstanceRen *obi;
629         ObjectRen *obr;
630         VlakRen *vlr= NULL;
631         int a, b, c, totface;
632
633         /* count */
634         totface= 0;
635         for(obi=re->instancetable.first; obi; obi=obi->next) {
636                 obr= obi->obr;
637                 for(a=0; a<obr->totvlak; a++) {
638                         if((a & 255)==0) vlr= obr->vlaknodes[a>>8].vlak;
639                         else vlr++;
640
641                         if((vlr->mat->mode & MA_TRACEBLE) && (vlr->mat->material_type == MA_TYPE_SURFACE))
642                                 totface++;
643                 }
644         }
645
646         if(totface == 0)
647                 return NULL;
648         
649         tree= MEM_callocN(sizeof(OcclusionTree), "OcclusionTree");
650         tree->totface= totface;
651
652         /* parameters */
653         tree->error= get_render_aosss_error(&re->r, re->wrld.ao_approx_error);
654         tree->distfac= (re->wrld.aomode & WO_AODIST)? re->wrld.aodistfac: 0.0f;
655
656         /* allocation */
657         tree->arena= BLI_memarena_new(0x8000 * sizeof(OccNode));
658         BLI_memarena_use_calloc(tree->arena);
659
660         if(re->wrld.aomode & WO_AOCACHE)
661                 tree->cache= MEM_callocN(sizeof(OcclusionCache)*BLENDER_MAX_THREADS, "OcclusionCache");
662
663         tree->face= MEM_callocN(sizeof(OccFace)*totface, "OcclusionFace");
664         tree->co= MEM_callocN(sizeof(float)*3*totface, "OcclusionCo");
665         tree->occlusion= MEM_callocN(sizeof(float)*totface, "OcclusionOcclusion");
666
667         if(re->wrld.ao_indirect_energy != 0.0f)
668                 tree->rad= MEM_callocN(sizeof(float)*3*totface, "OcclusionRad");
669
670         /* make array of face pointers */
671         for(b=0, c=0, obi=re->instancetable.first; obi; obi=obi->next, c++) {
672                 obr= obi->obr;
673                 for(a=0; a<obr->totvlak; a++) {
674                         if((a & 255)==0) vlr= obr->vlaknodes[a>>8].vlak;
675                         else vlr++;
676
677                         if((vlr->mat->mode & MA_TRACEBLE) && (vlr->mat->material_type == MA_TYPE_SURFACE)) {
678                                 tree->face[b].obi= c;
679                                 tree->face[b].facenr= a;
680                                 tree->occlusion[b]= 1.0f;
681                                 occ_face(&tree->face[b], tree->co[b], NULL, NULL); 
682                                 b++;
683                         }
684                 }
685         }
686
687         /* threads */
688         tree->totbuildthread= (re->r.threads > 1 && totface > 10000)? 8: 1;
689         tree->dothreadedbuild= (tree->totbuildthread > 1);
690
691         /* recurse */
692         tree->root= BLI_memarena_alloc(tree->arena, sizeof(OccNode));
693         tree->maxdepth= 1;
694         occ_build_recursive(tree, tree->root, 0, totface, 1);
695
696         if(re->wrld.ao_indirect_energy != 0.0f) {
697                 occ_build_shade(re, tree);
698                 occ_sum_occlusion(tree, tree->root);
699         }
700         
701         MEM_freeN(tree->co);
702         tree->co= NULL;
703
704         occ_build_sh_normalize(tree->root);
705
706         for(a=0; a<BLENDER_MAX_THREADS; a++)
707                 tree->stack[a]= MEM_callocN(sizeof(OccNode)*TOTCHILD*(tree->maxdepth+1), "OccStack");
708
709         return tree;
710 }
711
712 static void occ_free_tree(OcclusionTree *tree)
713 {
714         int a;
715
716         if(tree) {
717                 if(tree->arena) BLI_memarena_free(tree->arena);
718                 for(a=0; a<BLENDER_MAX_THREADS; a++)
719                         if(tree->stack[a])
720                                 MEM_freeN(tree->stack[a]);
721                 if(tree->occlusion) MEM_freeN(tree->occlusion);
722                 if(tree->cache) MEM_freeN(tree->cache);
723                 if(tree->face) MEM_freeN(tree->face);
724                 if(tree->rad) MEM_freeN(tree->rad);
725                 MEM_freeN(tree);
726         }
727 }
728
729 /* ------------------------- Traversal --------------------------- */
730
731 static float occ_solid_angle(OccNode *node, float *v, float d2, float invd2, float *receivenormal)
732 {
733         float dotreceive, dotemit;
734         float ev[3];
735
736         ev[0]= -v[0]*invd2;
737         ev[1]= -v[1]*invd2;
738         ev[2]= -v[2]*invd2;
739         dotemit= sh_eval(node->sh, ev);
740         dotreceive= INPR(receivenormal, v)*invd2;
741
742         CLAMP(dotemit, 0.0f, 1.0f);
743         CLAMP(dotreceive, 0.0f, 1.0f);
744         
745         return ((node->area*dotemit*dotreceive)/(d2 + node->area*INVPI))*INVPI;
746 }
747
748 static void VecAddDir(float *result, float *v1, float *v2, float fac)
749 {
750         result[0]= v1[0] + fac*(v2[0] - v1[0]);
751         result[1]= v1[1] + fac*(v2[1] - v1[1]);
752         result[2]= v1[2] + fac*(v2[2] - v1[2]);
753 }
754
755 static int occ_visible_quad(float *p, float *n, float *v0, float *v1, float *v2, float *q0, float *q1, float *q2, float *q3)
756 {
757         static const float epsilon = 1e-6f;
758         float c, sd[3];
759         
760         c= INPR(n, p);
761
762         /* signed distances from the vertices to the plane. */
763         sd[0]= INPR(n, v0) - c;
764         sd[1]= INPR(n, v1) - c;
765         sd[2]= INPR(n, v2) - c;
766
767         if(fabsf(sd[0]) < epsilon) sd[0] = 0.0f;
768         if(fabsf(sd[1]) < epsilon) sd[1] = 0.0f;
769         if(fabsf(sd[2]) < epsilon) sd[2] = 0.0f;
770
771         if(sd[0] > 0) {
772                 if(sd[1] > 0) {
773                         if(sd[2] > 0) {
774                                 // +++
775                                 VECCOPY(q0, v0);
776                                 VECCOPY(q1, v1);
777                                 VECCOPY(q2, v2);
778                                 VECCOPY(q3, q2);
779                         }
780                         else if(sd[2] < 0) {
781                                 // ++-
782                                 VECCOPY(q0, v0);
783                                 VECCOPY(q1, v1);
784                                 VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
785                                 VecAddDir(q3, v0, v2, (sd[0]/(sd[0]-sd[2])));
786                         }
787                         else {
788                                 // ++0
789                                 VECCOPY(q0, v0);
790                                 VECCOPY(q1, v1);
791                                 VECCOPY(q2, v2);
792                                 VECCOPY(q3, q2);
793                         }
794                 }
795                 else if(sd[1] < 0) {
796                         if(sd[2] > 0) {
797                                 // +-+
798                                 VECCOPY(q0, v0);
799                                 VecAddDir(q1, v0, v1, (sd[0]/(sd[0]-sd[1])));
800                                 VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
801                                 VECCOPY(q3, v2);
802                         }
803                         else if(sd[2] < 0) {
804                                 // +--
805                                 VECCOPY(q0, v0);
806                                 VecAddDir(q1, v0, v1, (sd[0]/(sd[0]-sd[1])));
807                                 VecAddDir(q2, v0, v2, (sd[0]/(sd[0]-sd[2])));
808                                 VECCOPY(q3, q2);
809                         }
810                         else {
811                                 // +-0
812                                 VECCOPY(q0, v0);
813                                 VecAddDir(q1, v0, v1, (sd[0]/(sd[0]-sd[1])));
814                                 VECCOPY(q2, v2);
815                                 VECCOPY(q3, q2);
816                         }
817                 }
818                 else {
819                         if(sd[2] > 0) {
820                                 // +0+
821                                 VECCOPY(q0, v0);
822                                 VECCOPY(q1, v1);
823                                 VECCOPY(q2, v2);
824                                 VECCOPY(q3, q2);
825                         }
826                         else if(sd[2] < 0) {
827                                 // +0-
828                                 VECCOPY(q0, v0);
829                                 VECCOPY(q1, v1);
830                                 VecAddDir(q2, v0, v2, (sd[0]/(sd[0]-sd[2])));
831                                 VECCOPY(q3, q2);
832                         }
833                         else {
834                                 // +00
835                                 VECCOPY(q0, v0);
836                                 VECCOPY(q1, v1);
837                                 VECCOPY(q2, v2);
838                                 VECCOPY(q3, q2);
839                         }
840                 }
841         }
842         else if(sd[0] < 0) {
843                 if(sd[1] > 0) {
844                         if(sd[2] > 0) {
845                                 // -++
846                                 VecAddDir(q0, v0, v1, (sd[0]/(sd[0]-sd[1])));
847                                 VECCOPY(q1, v1);
848                                 VECCOPY(q2, v2);
849                                 VecAddDir(q3, v0, v2, (sd[0]/(sd[0]-sd[2])));
850                         }
851                         else if(sd[2] < 0) {
852                                 // -+-
853                                 VecAddDir(q0, v0, v1, (sd[0]/(sd[0]-sd[1])));
854                                 VECCOPY(q1, v1);
855                                 VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
856                                 VECCOPY(q3, q2);
857                         }
858                         else {
859                                 // -+0
860                                 VecAddDir(q0, v0, v1, (sd[0]/(sd[0]-sd[1])));
861                                 VECCOPY(q1, v1);
862                                 VECCOPY(q2, v2);
863                                 VECCOPY(q3, q2);
864                         }
865                 }
866                 else if(sd[1] < 0) {
867                         if(sd[2] > 0) {
868                                 // --+
869                                 VecAddDir(q0, v0, v2, (sd[0]/(sd[0]-sd[2])));
870                                 VecAddDir(q1, v1, v2, (sd[1]/(sd[1]-sd[2])));
871                                 VECCOPY(q2, v2);
872                                 VECCOPY(q3, q2);
873                         }
874                         else if(sd[2] < 0) {
875                                 // ---
876                                 return 0;
877                         }
878                         else {
879                                 // --0
880                                 return 0;
881                         }
882                 }
883                 else {
884                         if(sd[2] > 0) {
885                                 // -0+
886                                 VecAddDir(q0, v0, v2, (sd[0]/(sd[0]-sd[2])));
887                                 VECCOPY(q1, v1);
888                                 VECCOPY(q2, v2);
889                                 VECCOPY(q3, q2);
890                         }
891                         else if(sd[2] < 0) {
892                                 // -0-
893                                 return 0;
894                         }
895                         else {
896                                 // -00
897                                 return 0;
898                         }
899                 }
900         }
901         else {
902                 if(sd[1] > 0) {
903                         if(sd[2] > 0) {
904                                 // 0++
905                                 VECCOPY(q0, v0);
906                                 VECCOPY(q1, v1);
907                                 VECCOPY(q2, v2);
908                                 VECCOPY(q3, q2);
909                         }
910                         else if(sd[2] < 0) {
911                                 // 0+-
912                                 VECCOPY(q0, v0);
913                                 VECCOPY(q1, v1);
914                                 VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
915                                 VECCOPY(q3, q2);
916                         }
917                         else {
918                                 // 0+0
919                                 VECCOPY(q0, v0);
920                                 VECCOPY(q1, v1);
921                                 VECCOPY(q2, v2);
922                                 VECCOPY(q3, q2);
923                         }
924                 }
925                 else if(sd[1] < 0) {
926                         if(sd[2] > 0) {
927                                 // 0-+
928                                 VECCOPY(q0, v0);
929                                 VecAddDir(q1, v1, v2, (sd[1]/(sd[1]-sd[2])));
930                                 VECCOPY(q2, v2);
931                                 VECCOPY(q3, q2);
932                         }
933                         else if(sd[2] < 0) {
934                                 // 0--
935                                 return 0;
936                         }
937                         else {
938                                 // 0-0
939                                 return 0;
940                         }
941                 }
942                 else {
943                         if(sd[2] > 0) {
944                                 // 00+
945                                 VECCOPY(q0, v0);
946                                 VECCOPY(q1, v1);
947                                 VECCOPY(q2, v2);
948                                 VECCOPY(q3, q2);
949                         }
950                         else if(sd[2] < 0) {
951                                 // 00-
952                                 return 0;
953                         }
954                         else {
955                                 // 000
956                                 return 0;
957                         }
958                 }
959         }
960
961         return 1;
962 }
963
964 /* altivec optimization, this works, but is unused */
965
966 #if 0
967 #include <Accelerate/Accelerate.h>
968
969 typedef union {
970         vFloat v;
971         float f[4];
972 } vFloatResult;
973
974 static vFloat vec_splat_float(float val)
975 {
976         return (vFloat){val, val, val, val};
977 }
978
979 static float occ_quad_form_factor(float *p, float *n, float *q0, float *q1, float *q2, float *q3)
980 {
981         vFloat vcos, rlen, vrx, vry, vrz, vsrx, vsry, vsrz, gx, gy, gz, vangle;
982         vUInt8 rotate = (vUInt8){4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3};
983         vFloatResult vresult;
984         float result;
985
986         /* compute r* */
987         vrx = (vFloat){q0[0], q1[0], q2[0], q3[0]} - vec_splat_float(p[0]);
988         vry = (vFloat){q0[1], q1[1], q2[1], q3[1]} - vec_splat_float(p[1]);
989         vrz = (vFloat){q0[2], q1[2], q2[2], q3[2]} - vec_splat_float(p[2]);
990
991         /* normalize r* */
992         rlen = vec_rsqrte(vrx*vrx + vry*vry + vrz*vrz + vec_splat_float(1e-16f));
993         vrx = vrx*rlen;
994         vry = vry*rlen;
995         vrz = vrz*rlen;
996
997         /* rotate r* for cross and dot */
998         vsrx= vec_perm(vrx, vrx, rotate);
999         vsry= vec_perm(vry, vry, rotate);
1000         vsrz= vec_perm(vrz, vrz, rotate);
1001
1002         /* cross product */
1003         gx = vsry*vrz - vsrz*vry;
1004         gy = vsrz*vrx - vsrx*vrz;
1005         gz = vsrx*vry - vsry*vrx;
1006
1007         /* normalize */
1008         rlen = vec_rsqrte(gx*gx + gy*gy + gz*gz + vec_splat_float(1e-16f));
1009         gx = gx*rlen;
1010         gy = gy*rlen;
1011         gz = gz*rlen;
1012
1013         /* angle */
1014         vcos = vrx*vsrx + vry*vsry + vrz*vsrz;
1015         vcos= vec_max(vec_min(vcos, vec_splat_float(1.0f)), vec_splat_float(-1.0f));
1016         vangle= vacosf(vcos);
1017
1018         /* dot */
1019         vresult.v = (vec_splat_float(n[0])*gx +
1020                      vec_splat_float(n[1])*gy +
1021                      vec_splat_float(n[2])*gz)*vangle;
1022
1023         result= (vresult.f[0] + vresult.f[1] + vresult.f[2] + vresult.f[3])*(0.5f/(float)M_PI);
1024         result= MAX2(result, 0.0f);
1025
1026         return result;
1027 }
1028
1029 #endif
1030
1031 /* SSE optimization, acos code doesn't work */
1032
1033 #if 0
1034
1035 #include <xmmintrin.h>
1036
1037 static __m128 sse_approx_acos(__m128 x)
1038 {
1039         /* needs a better approximation than taylor expansion of acos, since that
1040          * gives big erros for near 1.0 values, sqrt(2*x)*acos(1-x) should work
1041          * better, see http://www.tom.womack.net/projects/sse-fast-arctrig.html */
1042
1043         return _mm_set_ps1(1.0f);
1044 }
1045
1046 static float occ_quad_form_factor(float *p, float *n, float *q0, float *q1, float *q2, float *q3)
1047 {
1048         float r0[3], r1[3], r2[3], r3[3], g0[3], g1[3], g2[3], g3[3];
1049         float a1, a2, a3, a4, dot1, dot2, dot3, dot4, result;
1050         float fresult[4] __attribute__((aligned(16)));
1051         __m128 qx, qy, qz, rx, ry, rz, rlen, srx, sry, srz, gx, gy, gz, glen, rcos, angle, aresult;
1052
1053         /* compute r */
1054         qx = _mm_set_ps(q3[0], q2[0], q1[0], q0[0]);
1055         qy = _mm_set_ps(q3[1], q2[1], q1[1], q0[1]);
1056         qz = _mm_set_ps(q3[2], q2[2], q1[2], q0[2]);
1057
1058         rx = qx - _mm_set_ps1(p[0]);
1059         ry = qy - _mm_set_ps1(p[1]);
1060         rz = qz - _mm_set_ps1(p[2]);
1061
1062         /* normalize r */
1063         rlen = _mm_rsqrt_ps(rx*rx + ry*ry + rz*rz + _mm_set_ps1(1e-16f));
1064         rx = rx*rlen;
1065         ry = ry*rlen;
1066         rz = rz*rlen;
1067
1068         /* cross product */
1069         srx = _mm_shuffle_ps(rx, rx, _MM_SHUFFLE(0,3,2,1));
1070         sry = _mm_shuffle_ps(ry, ry, _MM_SHUFFLE(0,3,2,1));
1071         srz = _mm_shuffle_ps(rz, rz, _MM_SHUFFLE(0,3,2,1));
1072
1073         gx = sry*rz - srz*ry;
1074         gy = srz*rx - srx*rz;
1075         gz = srx*ry - sry*rx;
1076
1077         /* normalize g */
1078         glen = _mm_rsqrt_ps(gx*gx + gy*gy + gz*gz + _mm_set_ps1(1e-16f));
1079         gx = gx*glen;
1080         gy = gy*glen;
1081         gz = gz*glen;
1082
1083         /* compute angle */
1084         rcos = rx*srx + ry*sry + rz*srz;
1085         rcos= _mm_max_ps(_mm_min_ps(rcos, _mm_set_ps1(1.0f)), _mm_set_ps1(-1.0f));
1086
1087         angle = sse_approx_cos(rcos);
1088         aresult = (_mm_set_ps1(n[0])*gx + _mm_set_ps1(n[1])*gy + _mm_set_ps1(n[2])*gz)*angle;
1089
1090         /* sum together */
1091     result= (fresult[0] + fresult[1] + fresult[2] + fresult[3])*(0.5f/(float)M_PI);
1092         result= MAX2(result, 0.0f);
1093
1094         return result;
1095 }
1096
1097 #endif
1098
1099 static void normalizef(float *n)
1100 {
1101         float d;
1102         
1103         d= INPR(n, n);
1104
1105         if(d > 1.0e-35F) {
1106                 d= 1.0f/sqrtf(d);
1107
1108                 n[0] *= d; 
1109                 n[1] *= d; 
1110                 n[2] *= d;
1111         } 
1112 }
1113
1114 static float occ_quad_form_factor(float *p, float *n, float *q0, float *q1, float *q2, float *q3)
1115 {
1116         float r0[3], r1[3], r2[3], r3[3], g0[3], g1[3], g2[3], g3[3];
1117         float a1, a2, a3, a4, dot1, dot2, dot3, dot4, result;
1118
1119         VECSUB(r0, q0, p);
1120         VECSUB(r1, q1, p);
1121         VECSUB(r2, q2, p);
1122         VECSUB(r3, q3, p);
1123
1124         normalizef(r0);
1125         normalizef(r1);
1126         normalizef(r2);
1127         normalizef(r3);
1128
1129         cross_v3_v3v3(g0, r1, r0); normalizef(g0);
1130         cross_v3_v3v3(g1, r2, r1); normalizef(g1);
1131         cross_v3_v3v3(g2, r3, r2); normalizef(g2);
1132         cross_v3_v3v3(g3, r0, r3); normalizef(g3);
1133
1134         a1= saacosf(INPR(r0, r1));
1135         a2= saacosf(INPR(r1, r2));
1136         a3= saacosf(INPR(r2, r3));
1137         a4= saacosf(INPR(r3, r0));
1138
1139         dot1= INPR(n, g0);
1140         dot2= INPR(n, g1);
1141         dot3= INPR(n, g2);
1142         dot4= INPR(n, g3);
1143
1144         result= (a1*dot1 + a2*dot2 + a3*dot3 + a4*dot4)*0.5f/(float)M_PI;
1145         result= MAX2(result, 0.0f);
1146
1147         return result;
1148 }
1149
1150 static float occ_form_factor(OccFace *face, float *p, float *n)
1151 {
1152         ObjectInstanceRen *obi;
1153         VlakRen *vlr;
1154         float v1[3], v2[3], v3[3], v4[3], q0[3], q1[3], q2[3], q3[3], contrib= 0.0f;
1155
1156         obi= &R.objectinstance[face->obi];
1157         vlr= RE_findOrAddVlak(obi->obr, face->facenr);
1158
1159         VECCOPY(v1, vlr->v1->co);
1160         VECCOPY(v2, vlr->v2->co);
1161         VECCOPY(v3, vlr->v3->co);
1162
1163         if(obi->flag & R_TRANSFORMED) {
1164                 mul_m4_v3(obi->mat, v1);
1165                 mul_m4_v3(obi->mat, v2);
1166                 mul_m4_v3(obi->mat, v3);
1167         }
1168
1169         if(occ_visible_quad(p, n, v1, v2, v3, q0, q1, q2, q3))
1170                 contrib += occ_quad_form_factor(p, n, q0, q1, q2, q3);
1171
1172         if(vlr->v4) {
1173                 VECCOPY(v4, vlr->v4->co);
1174                 if(obi->flag & R_TRANSFORMED)
1175                         mul_m4_v3(obi->mat, v4);
1176
1177                 if(occ_visible_quad(p, n, v1, v3, v4, q0, q1, q2, q3))
1178                         contrib += occ_quad_form_factor(p, n, q0, q1, q2, q3);
1179         }
1180
1181         return contrib;
1182 }
1183
1184 static void occ_lookup(OcclusionTree *tree, int thread, OccFace *exclude, float *pp, float *pn, float *occ, float rad[3], float bentn[3])
1185 {
1186         OccNode *node, **stack;
1187         OccFace *face;
1188         float resultocc, resultrad[3], v[3], p[3], n[3], co[3], invd2;
1189         float distfac, fac, error, d2, weight, emitarea;
1190         int b, f, totstack;
1191
1192         /* init variables */
1193         VECCOPY(p, pp);
1194         VECCOPY(n, pn);
1195         VECADDFAC(p, p, n, 1e-4f);
1196
1197         if(bentn)
1198                 copy_v3_v3(bentn, n);
1199         
1200         error= tree->error;
1201         distfac= tree->distfac;
1202
1203         resultocc= 0.0f;
1204         zero_v3(resultrad);
1205
1206         /* init stack */
1207         stack= tree->stack[thread];
1208         stack[0]= tree->root;
1209         totstack= 1;
1210
1211         while(totstack) {
1212                 /* pop point off the stack */
1213                 node= stack[--totstack];
1214
1215                 VECSUB(v, node->co, p);
1216                 d2= INPR(v, v) + 1e-16f;
1217                 emitarea= MAX2(node->area, node->dco);
1218
1219                 if(d2*error > emitarea) {
1220                         if(distfac != 0.0f) {
1221                                 fac= 1.0f/(1.0f + distfac*d2);
1222                                 if(fac < 0.01f)
1223                                         continue;
1224                         }
1225                         else
1226                                 fac= 1.0f;
1227
1228                         /* accumulate occlusion from spherical harmonics */
1229                         invd2 = 1.0f/sqrtf(d2);
1230                         weight= occ_solid_angle(node, v, d2, invd2, n);
1231
1232                         if(rad)
1233                                 madd_v3_v3fl(resultrad, node->rad, weight*fac);
1234
1235                         weight *= node->occlusion;
1236
1237                         if(bentn) {
1238                                 bentn[0] -= weight*invd2*v[0];
1239                                 bentn[1] -= weight*invd2*v[1];
1240                                 bentn[2] -= weight*invd2*v[2];
1241                         }
1242
1243                         resultocc += weight*fac;
1244                 }
1245                 else {
1246                         /* traverse into children */
1247                         for(b=0; b<TOTCHILD; b++) {
1248                                 if(node->childflag & (1<<b)) {
1249                                         f= node->child[b].face;
1250                                         face= &tree->face[f];
1251
1252                                         /* accumulate occlusion with face form factor */
1253                                         if(!exclude || !(face->obi == exclude->obi && face->facenr == exclude->facenr)) {
1254                                                 if(bentn || distfac != 0.0f) {
1255                                                         occ_face(face, co, NULL, NULL); 
1256                                                         VECSUB(v, co, p);
1257                                                         d2= INPR(v, v) + 1e-16f;
1258
1259                                                         fac= (distfac == 0.0f)? 1.0f: 1.0f/(1.0f + distfac*d2);
1260                                                         if(fac < 0.01f)
1261                                                                 continue;
1262                                                 }
1263                                                 else
1264                                                         fac= 1.0f;
1265
1266                                                 weight= occ_form_factor(face, p, n);
1267
1268                                                 if(rad)
1269                                                         madd_v3_v3fl(resultrad, tree->rad[f], weight*fac);
1270
1271                                                 weight *= tree->occlusion[f];
1272
1273                                                 if(bentn) {
1274                                                         invd2= 1.0f/sqrtf(d2);
1275                                                         bentn[0] -= weight*invd2*v[0];
1276                                                         bentn[1] -= weight*invd2*v[1];
1277                                                         bentn[2] -= weight*invd2*v[2];
1278                                                 }
1279
1280                                                 resultocc += weight*fac;
1281                                         }
1282                                 }
1283                                 else if(node->child[b].node) {
1284                                         /* push child on the stack */
1285                                         stack[totstack++]= node->child[b].node;
1286                                 }
1287                         }
1288                 }
1289         }
1290
1291         if(occ) *occ= resultocc;
1292         if(rad) copy_v3_v3(rad, resultrad);
1293         /*if(rad && exclude) {
1294                 int a;
1295                 for(a=0; a<tree->totface; a++)
1296                         if((tree->face[a].obi == exclude->obi && tree->face[a].facenr == exclude->facenr))
1297                                 copy_v3_v3(rad, tree->rad[a]);
1298         }*/
1299         if(bentn) normalize_v3(bentn);
1300 }
1301
1302 static void occ_compute_passes(Render *re, OcclusionTree *tree, int totpass)
1303 {
1304         float *occ, (*rad)[3]= NULL, co[3], n[3];
1305         int pass, i;
1306         
1307         occ= MEM_callocN(sizeof(float)*tree->totface, "OcclusionPassOcc");
1308         if(tree->rad)
1309                 rad= MEM_callocN(sizeof(float)*3*tree->totface, "OcclusionPassRad");
1310
1311         for(pass=0; pass<totpass; pass++) {
1312                 for(i=0; i<tree->totface; i++) {
1313                         occ_face(&tree->face[i], co, n, NULL);
1314                         negate_v3(n);
1315                         VECADDFAC(co, co, n, 1e-8f);
1316
1317                         occ_lookup(tree, 0, &tree->face[i], co, n, &occ[i], NULL, (rad)? rad[i]: NULL);
1318                         if(re->test_break(re->tbh))
1319                                 break;
1320                 }
1321
1322                 if(re->test_break(re->tbh))
1323                         break;
1324
1325                 for(i=0; i<tree->totface; i++) {
1326                         tree->occlusion[i] -= occ[i]; //MAX2(1.0f-occ[i], 0.0f);
1327                         if(tree->occlusion[i] < 0.0f)
1328                                 tree->occlusion[i]= 0.0f;
1329
1330                         if(rad) {
1331                                 sub_v3_v3(tree->rad[i], rad[i]);
1332
1333                                 if(tree->rad[i][0] < 0.0f)
1334                                         tree->rad[i][0]= 0.0f;
1335                                 if(tree->rad[i][1] < 0.0f)
1336                                         tree->rad[i][1]= 0.0f;
1337                                 if(tree->rad[i][2] < 0.0f)
1338                                         tree->rad[i][2]= 0.0f;
1339                         }
1340                 }
1341
1342                 occ_sum_occlusion(tree, tree->root);
1343         }
1344
1345         MEM_freeN(occ);
1346         if(rad)
1347                 MEM_freeN(rad);
1348 }
1349
1350 static void sample_occ_tree(Render *re, OcclusionTree *tree, OccFace *exclude, float *co, float *n, int thread, int onlyshadow, float *ao, float *indirect)
1351 {
1352         float nn[3], bn[3], fac, occ, occlusion, correction, rad[3];
1353         int aocolor, aorad;
1354
1355         aocolor= re->wrld.aocolor;
1356         if(onlyshadow)
1357                 aocolor= WO_AOPLAIN;
1358         aorad= (re->wrld.ao_indirect_energy != 0.0f);
1359
1360         VECCOPY(nn, n);
1361         negate_v3(nn);
1362
1363         occ_lookup(tree, thread, exclude, co, nn, &occ, (aorad)? rad: NULL, (aocolor)? bn: NULL);
1364
1365         correction= re->wrld.ao_approx_correction;
1366
1367         occlusion= (1.0f-correction)*(1.0f-occ);
1368         CLAMP(occlusion, 0.0f, 1.0f);
1369         if(correction != 0.0f)
1370                 occlusion += correction*exp(-occ);
1371
1372         if(aocolor) {
1373                 /* sky shading using bent normal */
1374                 if(ELEM(aocolor, WO_AOSKYCOL, WO_AOSKYTEX)) {
1375                         fac= 0.5*(1.0f+bn[0]*re->grvec[0]+ bn[1]*re->grvec[1]+ bn[2]*re->grvec[2]);
1376                         ao[0]= (1.0f-fac)*re->wrld.horr + fac*re->wrld.zenr;
1377                         ao[1]= (1.0f-fac)*re->wrld.horg + fac*re->wrld.zeng;
1378                         ao[2]= (1.0f-fac)*re->wrld.horb + fac*re->wrld.zenb;
1379                 }
1380 #if 0
1381                 else {  /* WO_AOSKYTEX */
1382                         float dxyview[3];
1383                         bn[0]= -bn[0];
1384                         bn[1]= -bn[1];
1385                         bn[2]= -bn[2];
1386                         dxyview[0]= 1.0f;
1387                         dxyview[1]= 1.0f;
1388                         dxyview[2]= 0.0f;
1389                         shadeSkyView(ao, co, bn, dxyview);
1390                 }
1391 #endif
1392
1393                 mul_v3_fl(ao, occlusion);
1394         }
1395         else {
1396                 ao[0]= occlusion;
1397                 ao[1]= occlusion;
1398                 ao[2]= occlusion;
1399         }
1400
1401         if(aorad) copy_v3_v3(indirect, rad);
1402         else zero_v3(indirect);
1403 }
1404
1405 /* ---------------------------- Caching ------------------------------- */
1406
1407 static OcclusionCacheSample *find_occ_sample(OcclusionCache *cache, int x, int y)
1408 {
1409         x -= cache->x;
1410         y -= cache->y;
1411
1412         x /= cache->step;
1413         y /= cache->step;
1414         x *= cache->step;
1415         y *= cache->step;
1416
1417         if(x < 0 || x >= cache->w || y < 0 || y >= cache->h)
1418                 return NULL;
1419         else
1420                 return &cache->sample[y*cache->w + x];
1421 }
1422
1423 static int sample_occ_cache(OcclusionTree *tree, float *co, float *n, int x, int y, int thread, float *ao, float *indirect)
1424 {
1425         OcclusionCache *cache;
1426         OcclusionCacheSample *samples[4], *sample;
1427         float wn[4], wz[4], wb[4], tx, ty, w, totw, mino, maxo;
1428         float d[3], dist2;
1429         int i, x1, y1, x2, y2;
1430
1431         if(!tree->cache)
1432                 return 0;
1433         
1434         /* first try to find a sample in the same pixel */
1435         cache= &tree->cache[thread];
1436
1437         if(cache->sample && cache->step) {
1438                 sample= &cache->sample[(y-cache->y)*cache->w + (x-cache->x)];
1439                 if(sample->filled) {
1440                         VECSUB(d, sample->co, co);
1441                         dist2= INPR(d, d);
1442                         if(dist2 < 0.5f*sample->dist2 && INPR(sample->n, n) > 0.98f) {
1443                                 VECCOPY(ao, sample->ao);
1444                                 VECCOPY(indirect, sample->indirect);
1445                                 return 1;
1446                         }
1447                 }
1448         }
1449         else
1450                 return 0;
1451
1452         /* try to interpolate between 4 neighbouring pixels */
1453         samples[0]= find_occ_sample(cache, x, y);
1454         samples[1]= find_occ_sample(cache, x+cache->step, y);
1455         samples[2]= find_occ_sample(cache, x, y+cache->step);
1456         samples[3]= find_occ_sample(cache, x+cache->step, y+cache->step);
1457
1458         for(i=0; i<4; i++)
1459                 if(!samples[i] || !samples[i]->filled)
1460                         return 0;
1461
1462         /* require intensities not being too different */
1463         mino= MIN4(samples[0]->intensity, samples[1]->intensity, samples[2]->intensity, samples[3]->intensity);
1464         maxo= MAX4(samples[0]->intensity, samples[1]->intensity, samples[2]->intensity, samples[3]->intensity);
1465
1466         if(maxo - mino > 0.05f)
1467                 return 0;
1468
1469         /* compute weighted interpolation between samples */
1470         zero_v3(ao);
1471         zero_v3(indirect);
1472         totw= 0.0f;
1473
1474         x1= samples[0]->x;
1475         y1= samples[0]->y;
1476         x2= samples[3]->x;
1477         y2= samples[3]->y;
1478
1479         tx= (float)(x2 - x)/(float)(x2 - x1);
1480         ty= (float)(y2 - y)/(float)(y2 - y1);
1481
1482         wb[3]= (1.0f-tx)*(1.0f-ty);
1483         wb[2]= (tx)*(1.0f-ty);
1484         wb[1]= (1.0f-tx)*(ty);
1485         wb[0]= tx*ty;
1486
1487         for(i=0; i<4; i++) {
1488                 VECSUB(d, samples[i]->co, co);
1489                 dist2= INPR(d, d);
1490
1491                 wz[i]= 1.0f; //(samples[i]->dist2/(1e-4f + dist2));
1492                 wn[i]= pow(INPR(samples[i]->n, n), 32.0f);
1493
1494                 w= wb[i]*wn[i]*wz[i];
1495
1496                 totw += w;
1497                 madd_v3_v3fl(ao, samples[i]->ao, w);
1498                 madd_v3_v3fl(indirect, samples[i]->indirect, w);
1499         }
1500
1501         if(totw >= 0.9f) {
1502                 totw= 1.0f/totw;
1503                 mul_v3_fl(ao, totw);
1504                 mul_v3_fl(indirect, totw);
1505                 return 1;
1506         }
1507
1508         return 0;
1509 }
1510
1511 static void sample_occ_surface(ShadeInput *shi)
1512 {
1513         StrandRen *strand= shi->strand;
1514         StrandSurface *mesh= strand->buffer->surface;
1515         int *face, *index = RE_strandren_get_face(shi->obr, strand, 0);
1516         float w[4], *co1, *co2, *co3, *co4;
1517
1518         if(mesh && mesh->face && mesh->co && mesh->ao && index) {
1519                 face= mesh->face[*index];
1520
1521                 co1= mesh->co[face[0]];
1522                 co2= mesh->co[face[1]];
1523                 co3= mesh->co[face[2]];
1524                 co4= (face[3])? mesh->co[face[3]]: NULL;
1525
1526                 interp_weights_face_v3(w, co1, co2, co3, co4, strand->vert->co);
1527
1528                 zero_v3(shi->ao);
1529                 zero_v3(shi->indirect);
1530
1531                 madd_v3_v3fl(shi->ao, mesh->ao[face[0]], w[0]);
1532                 madd_v3_v3fl(shi->indirect, mesh->indirect[face[0]], w[0]);
1533                 madd_v3_v3fl(shi->ao, mesh->ao[face[1]], w[1]);
1534                 madd_v3_v3fl(shi->indirect, mesh->indirect[face[1]], w[1]);
1535                 madd_v3_v3fl(shi->ao, mesh->ao[face[2]], w[2]);
1536                 madd_v3_v3fl(shi->indirect, mesh->indirect[face[2]], w[2]);
1537                 if(face[3]) {
1538                         madd_v3_v3fl(shi->ao, mesh->ao[face[3]], w[3]);
1539                         madd_v3_v3fl(shi->indirect, mesh->indirect[face[3]], w[3]);
1540                 }
1541         }
1542         else {
1543                 shi->ao[0]= 1.0f;
1544                 shi->ao[1]= 1.0f;
1545                 shi->ao[2]= 1.0f;
1546                 zero_v3(shi->indirect);
1547         }
1548 }
1549
1550 /* ------------------------- External Functions --------------------------- */
1551
1552 static void *exec_strandsurface_sample(void *data)
1553 {
1554         OcclusionThread *othread= (OcclusionThread*)data;
1555         Render *re= othread->re;
1556         StrandSurface *mesh= othread->mesh;
1557         float ao[3], indirect[3], co[3], n[3], *co1, *co2, *co3, *co4;
1558         int a, *face;
1559
1560         for(a=othread->begin; a<othread->end; a++) {
1561                 face= mesh->face[a];
1562                 co1= mesh->co[face[0]];
1563                 co2= mesh->co[face[1]];
1564                 co3= mesh->co[face[2]];
1565
1566                 if(face[3]) {
1567                         co4= mesh->co[face[3]];
1568
1569                         interp_v3_v3v3(co, co1, co3, 0.5f);
1570                         normal_quad_v3( n,co1, co2, co3, co4);
1571                 }
1572                 else {
1573                         cent_tri_v3(co, co1, co2, co3);
1574                         normal_tri_v3( n,co1, co2, co3);
1575                 }
1576                 negate_v3(n);
1577
1578                 sample_occ_tree(re, re->occlusiontree, NULL, co, n, othread->thread, 0, ao, indirect);
1579                 VECCOPY(othread->faceao[a], ao);
1580                 VECCOPY(othread->faceindirect[a], indirect);
1581         }
1582
1583         return 0;
1584 }
1585
1586 void make_occ_tree(Render *re)
1587 {
1588         OcclusionThread othreads[BLENDER_MAX_THREADS];
1589         StrandSurface *mesh;
1590         ListBase threads;
1591         float ao[3], indirect[3], (*faceao)[3], (*faceindirect)[3];
1592         int a, totface, totthread, *face, *count;
1593
1594         /* ugly, needed for occ_face */
1595         R= *re;
1596
1597         re->i.infostr= "Occlusion preprocessing";
1598         re->stats_draw(re->sdh, &re->i);
1599         
1600         re->occlusiontree= occ_tree_build(re);
1601         
1602         if(re->occlusiontree) {
1603                 if(re->wrld.ao_approx_passes)
1604                         occ_compute_passes(re, re->occlusiontree, re->wrld.ao_approx_passes);
1605
1606                 for(mesh=re->strandsurface.first; mesh; mesh=mesh->next) {
1607                         if(!mesh->face || !mesh->co || !mesh->ao)
1608                                 continue;
1609
1610                         count= MEM_callocN(sizeof(int)*mesh->totvert, "OcclusionCount");
1611                         faceao= MEM_callocN(sizeof(float)*3*mesh->totface, "StrandSurfFaceAO");
1612                         faceindirect= MEM_callocN(sizeof(float)*3*mesh->totface, "StrandSurfFaceIndirect");
1613
1614                         totthread= (mesh->totface > 10000)? re->r.threads: 1;
1615                         totface= mesh->totface/totthread;
1616                         for(a=0; a<totthread; a++) {
1617                                 othreads[a].re= re;
1618                                 othreads[a].faceao= faceao;
1619                                 othreads[a].faceindirect= faceindirect;
1620                                 othreads[a].thread= a;
1621                                 othreads[a].mesh= mesh;
1622                                 othreads[a].begin= a*totface;
1623                                 othreads[a].end= (a == totthread-1)? mesh->totface: (a+1)*totface;
1624                         }
1625
1626                         if(totthread == 1) {
1627                                 exec_strandsurface_sample(&othreads[0]);
1628                         }
1629                         else {
1630                                 BLI_init_threads(&threads, exec_strandsurface_sample, totthread);
1631
1632                                 for(a=0; a<totthread; a++)
1633                                         BLI_insert_thread(&threads, &othreads[a]);
1634
1635                                 BLI_end_threads(&threads);
1636                         }
1637
1638                         for(a=0; a<mesh->totface; a++) {
1639                                 face= mesh->face[a];
1640
1641                                 VECCOPY(ao, faceao[a]);
1642                                 VECCOPY(indirect, faceindirect[a]);
1643
1644                                 VECADD(mesh->ao[face[0]], mesh->ao[face[0]], ao);
1645                                 VECADD(mesh->indirect[face[0]], mesh->indirect[face[0]], indirect);
1646                                 count[face[0]]++;
1647                                 VECADD(mesh->ao[face[1]], mesh->ao[face[1]], ao);
1648                                 VECADD(mesh->indirect[face[1]], mesh->indirect[face[1]], indirect);
1649                                 count[face[1]]++;
1650                                 VECADD(mesh->ao[face[2]], mesh->ao[face[2]], ao);
1651                                 VECADD(mesh->indirect[face[2]], mesh->indirect[face[2]], indirect);
1652                                 count[face[2]]++;
1653
1654                                 if(face[3]) {
1655                                         VECADD(mesh->ao[face[3]], mesh->ao[face[3]], ao);
1656                                         VECADD(mesh->indirect[face[3]], mesh->indirect[face[3]], indirect);
1657                                         count[face[3]]++;
1658                                 }
1659                         }
1660
1661                         for(a=0; a<mesh->totvert; a++) {
1662                                 if(count[a]) {
1663                                         mul_v3_fl(mesh->ao[a], 1.0f/count[a]);
1664                                         mul_v3_fl(mesh->indirect[a], 1.0f/count[a]);
1665                                 }
1666                         }
1667
1668                         MEM_freeN(count);
1669                         MEM_freeN(faceao);
1670                         MEM_freeN(faceindirect);
1671                 }
1672         }
1673 }
1674
1675 void free_occ(Render *re)
1676 {
1677         if(re->occlusiontree) {
1678                 occ_free_tree(re->occlusiontree);
1679                 re->occlusiontree = NULL;
1680         }
1681 }
1682
1683 void sample_occ(Render *re, ShadeInput *shi)
1684 {
1685         OcclusionTree *tree= re->occlusiontree;
1686         OcclusionCache *cache;
1687         OcclusionCacheSample *sample;
1688         OccFace exclude;
1689         int onlyshadow;
1690
1691         if(tree) {
1692                 if(shi->strand) {
1693                         sample_occ_surface(shi);
1694                 }
1695                 /* try to get result from the cache if possible */
1696                 else if(shi->depth!=0 || !sample_occ_cache(tree, shi->co, shi->vno, shi->xs, shi->ys, shi->thread, shi->ao, shi->indirect)) {
1697                         /* no luck, let's sample the occlusion */
1698                         exclude.obi= shi->obi - re->objectinstance;
1699                         exclude.facenr= shi->vlr->index;
1700                         onlyshadow= (shi->mat->mode & MA_ONLYSHADOW);
1701                         sample_occ_tree(re, tree, &exclude, shi->co, shi->vno, shi->thread, onlyshadow, shi->ao, shi->indirect);
1702
1703                         /* fill result into sample, each time */
1704                         if(tree->cache) {
1705                                 cache= &tree->cache[shi->thread];
1706
1707                                 if(cache->sample && cache->step) {
1708                                         sample= &cache->sample[(shi->ys-cache->y)*cache->w + (shi->xs-cache->x)];
1709                                         VECCOPY(sample->co, shi->co);
1710                                         VECCOPY(sample->n, shi->vno);
1711                                         VECCOPY(sample->ao, shi->ao);
1712                                         VECCOPY(sample->indirect, shi->indirect);
1713                                         sample->intensity= MAX3(sample->ao[0], sample->ao[1], sample->ao[2]);
1714                                         sample->intensity= MAX2(sample->intensity, MAX3(sample->indirect[0], sample->indirect[1], sample->indirect[2]));
1715                                         sample->dist2= INPR(shi->dxco, shi->dxco) + INPR(shi->dyco, shi->dyco);
1716                                         sample->filled= 1;
1717                                 }
1718                         }
1719                 }
1720         }
1721         else {
1722                 shi->ao[0]= 1.0f;
1723                 shi->ao[1]= 1.0f;
1724                 shi->ao[2]= 1.0f;
1725
1726                 shi->indirect[0]= 0.0f;
1727                 shi->indirect[1]= 0.0f;
1728                 shi->indirect[2]= 0.0f;
1729         }
1730 }
1731
1732 void cache_occ_samples(Render *re, RenderPart *pa, ShadeSample *ssamp)
1733 {
1734         OcclusionTree *tree= re->occlusiontree;
1735         PixStr ps;
1736         OcclusionCache *cache;
1737         OcclusionCacheSample *sample;
1738         OccFace exclude;
1739         ShadeInput *shi;
1740         intptr_t *rd=NULL;
1741         int *ro=NULL, *rp=NULL, *rz=NULL, onlyshadow;
1742         int x, y, step = CACHE_STEP;
1743
1744         if(!tree->cache)
1745                 return;
1746
1747         cache= &tree->cache[pa->thread];
1748         cache->w= pa->rectx;
1749         cache->h= pa->recty;
1750         cache->x= pa->disprect.xmin;
1751         cache->y= pa->disprect.ymin;
1752         cache->step= step;
1753         cache->sample= MEM_callocN(sizeof(OcclusionCacheSample)*cache->w*cache->h, "OcclusionCacheSample");
1754         sample= cache->sample;
1755
1756         if(re->osa) {
1757                 rd= pa->rectdaps;
1758         }
1759         else {
1760                 /* fake pixel struct for non-osa */
1761                 ps.next= NULL;
1762                 ps.mask= 0xFFFF;
1763
1764                 ro= pa->recto;
1765                 rp= pa->rectp;
1766                 rz= pa->rectz;
1767         }
1768
1769         /* compute a sample at every step pixels */
1770         for(y=pa->disprect.ymin; y<pa->disprect.ymax; y++) {
1771                 for(x=pa->disprect.xmin; x<pa->disprect.xmax; x++, sample++, rd++, ro++, rp++, rz++) {
1772                         if(!(((x - pa->disprect.xmin + step) % step) == 0 || x == pa->disprect.xmax-1))
1773                                 continue;
1774                         if(!(((y - pa->disprect.ymin + step) % step) == 0 || y == pa->disprect.ymax-1))
1775                                 continue;
1776
1777                         if(re->osa) {
1778                                 if(!*rd) continue;
1779
1780                                 shade_samples_fill_with_ps(ssamp, (PixStr *)(*rd), x, y);
1781                         }
1782                         else {
1783                                 if(!*rp) continue;
1784
1785                                 ps.obi= *ro;
1786                                 ps.facenr= *rp;
1787                                 ps.z= *rz;
1788                                 shade_samples_fill_with_ps(ssamp, &ps, x, y);
1789                         }
1790
1791                         shi= ssamp->shi;
1792                         if(shi->vlr) {
1793                                 onlyshadow= (shi->mat->mode & MA_ONLYSHADOW);
1794                                 exclude.obi= shi->obi - re->objectinstance;
1795                                 exclude.facenr= shi->vlr->index;
1796                                 sample_occ_tree(re, tree, &exclude, shi->co, shi->vno, shi->thread, onlyshadow, shi->ao, shi->indirect);
1797
1798                                 VECCOPY(sample->co, shi->co);
1799                                 VECCOPY(sample->n, shi->vno);
1800                                 VECCOPY(sample->ao, shi->ao);
1801                                 VECCOPY(sample->indirect, shi->indirect);
1802                                 sample->intensity= MAX3(sample->ao[0], sample->ao[1], sample->ao[2]);
1803                                 sample->intensity= MAX2(sample->intensity, MAX3(sample->indirect[0], sample->indirect[1], sample->indirect[2]));
1804                                 sample->dist2= INPR(shi->dxco, shi->dxco) + INPR(shi->dyco, shi->dyco);
1805                                 sample->x= shi->xs;
1806                                 sample->y= shi->ys;
1807                                 sample->filled= 1;
1808                         }
1809
1810                         if(re->test_break(re->tbh))
1811                                 break;
1812                 }
1813         }
1814 }
1815
1816 void free_occ_samples(Render *re, RenderPart *pa)
1817 {
1818         OcclusionTree *tree= re->occlusiontree;
1819         OcclusionCache *cache;
1820
1821         if(tree->cache) {
1822                 cache= &tree->cache[pa->thread];
1823
1824                 if(cache->sample)
1825                         MEM_freeN(cache->sample);
1826
1827                 cache->w= 0;
1828                 cache->h= 0;
1829                 cache->step= 0;
1830         }
1831 }
1832