AAO Indirect Diffuse
[blender.git] / source / blender / render / intern / source / occlusion.c
1 /* 
2  * $Id$
3  *
4  * ***** BEGIN GPL LICENSE BLOCK *****
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software Foundation,
18  * Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
19  *
20  * The Original Code is Copyright (C) 2008 Blender Foundation.
21  * All rights reserved.
22  *
23  * The Original Code is: all of this file.
24  *
25  * Contributor(s): Brecht Van Lommel.
26  *
27  * ***** END GPL LICENSE BLOCK *****
28  */
29
30 #include <math.h>
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34
35 #include "MEM_guardedalloc.h"
36
37 #include "DNA_material_types.h"
38
39 #include "BLI_math.h"
40 #include "BLI_blenlib.h"
41 #include "BLI_memarena.h"
42 #include "BLI_threads.h"
43
44 #include "BKE_global.h"
45 #include "BKE_scene.h"
46 #include "BKE_utildefines.h"
47
48 #include "RE_shader_ext.h"
49
50 /* local includes */
51 #include "occlusion.h"
52 #include "render_types.h"
53 #include "rendercore.h"
54 #include "renderdatabase.h"
55 #include "pixelshading.h"
56 #include "shading.h"
57 #include "zbuf.h"
58
59 /* ------------------------- Declarations --------------------------- */
60
61 #define INVALID_INDEX ((int)(~0))
62 #define INVPI 0.31830988618379069f
63 #define TOTCHILD 8
64 #define CACHE_STEP 3
65
66 typedef struct OcclusionCacheSample {
67         float co[3], n[3], ao[3], indirect[3], intensity, dist2;
68         int x, y, filled;
69 } OcclusionCacheSample;
70
71 typedef struct OcclusionCache {
72         OcclusionCacheSample *sample;
73         int x, y, w, h, step;
74 } OcclusionCache;
75
76 typedef struct OccFace {
77         int obi;
78         int facenr;
79 } OccFace;
80
81 typedef struct OccNode {
82         float co[3], area;
83         float sh[9], dco;
84         float occlusion, rad[3];
85         int childflag;
86         union {
87                 //OccFace face;
88                 int face;
89                 struct OccNode *node;
90         } child[TOTCHILD];
91 } OccNode;
92
93 typedef struct OcclusionTree {
94         MemArena *arena;
95
96         float (*co)[3];         /* temporary during build */
97
98         OccFace *face;          /* instance and face indices */
99         float *occlusion;       /* occlusion for faces */
100         float (*rad)[3];        /* radiance for faces */
101         
102         OccNode *root;
103
104         OccNode **stack[BLENDER_MAX_THREADS];
105         int maxdepth;
106
107         int totface;
108
109         float error;
110         float distfac;
111
112         int dothreadedbuild;
113         int totbuildthread;
114         int doindirect;
115
116         OcclusionCache *cache;
117 } OcclusionTree;
118
119 typedef struct OcclusionThread {
120         Render *re;
121         StrandSurface *mesh;
122         float (*faceao)[3];
123         float (*faceindirect)[3];
124         int begin, end;
125         int thread;
126 } OcclusionThread;
127
128 typedef struct OcclusionBuildThread {
129         OcclusionTree *tree;
130         int begin, end, depth;
131         OccNode *node;
132 } OcclusionBuildThread;
133
134 /* ------------------------- Shading --------------------------- */
135
136 extern Render R; // meh
137
138 static void occ_shade(ShadeSample *ssamp, ObjectInstanceRen *obi, VlakRen *vlr, float *rad)
139 {
140         ShadeInput *shi= ssamp->shi;
141         ShadeResult *shr= ssamp->shr;
142         float l, u, v, *v1, *v2, *v3;
143         
144         /* init */
145         if(vlr->v4) {
146                 shi->u= u= 0.5f;
147                 shi->v= v= 0.5f;
148         }
149         else {
150                 shi->u= u= 1.0f/3.0f;
151                 shi->v= v= 1.0f/3.0f;
152         }
153
154         /* setup render coordinates */
155         v1= vlr->v1->co;
156         v2= vlr->v2->co;
157         v3= vlr->v3->co;
158         
159         /* renderco */
160         l= 1.0f-u-v;
161         
162         shi->co[0]= l*v3[0]+u*v1[0]+v*v2[0];
163         shi->co[1]= l*v3[1]+u*v1[1]+v*v2[1];
164         shi->co[2]= l*v3[2]+u*v1[2]+v*v2[2];
165
166         shade_input_set_triangle_i(shi, obi, vlr, 0, 1, 2);
167
168         /* set up view vector */
169         VECCOPY(shi->view, shi->co);
170         normalize_v3(shi->view);
171         
172         /* cache for shadow */
173         shi->samplenr++;
174         
175         shi->xs= 0; // TODO
176         shi->ys= 0;
177         
178         shade_input_set_normals(shi);
179
180         /* no normal flip */
181         if(shi->flippednor)
182                 shade_input_flip_normals(shi);
183
184         madd_v3_v3fl(shi->co, shi->vn, 0.0001f); /* ugly.. */
185
186         /* not a pretty solution, but fixes common cases */
187         if(shi->obr->ob && shi->obr->ob->transflag & OB_NEG_SCALE) {
188                 negate_v3(shi->vn);
189                 negate_v3(shi->vno);
190         }
191
192         /* init material vars */
193         // note, keep this synced with render_types.h
194         memcpy(&shi->r, &shi->mat->r, 23*sizeof(float));
195         shi->har= shi->mat->har;
196         
197         /* render */
198         shade_input_set_shade_texco(shi);
199         shade_material_loop(shi, shr); /* todo: nodes */
200         
201         VECCOPY(rad, shr->combined);
202 }
203
204 static void occ_build_shade(Render *re, OcclusionTree *tree)
205 {
206         ShadeSample ssamp;
207         ObjectInstanceRen *obi;
208         VlakRen *vlr;
209         int a;
210
211         R= *re;
212
213         /* setup shade sample with correct passes */
214         memset(&ssamp, 0, sizeof(ShadeSample));
215         ssamp.shi[0].lay= re->scene->lay;
216         ssamp.shi[0].passflag= SCE_PASS_DIFFUSE|SCE_PASS_RGBA;
217         ssamp.shi[0].combinedflag= ~(SCE_PASS_SPEC);
218         ssamp.tot= 1;
219
220         for(a=0; a<tree->totface; a++) {
221                 obi= &R.objectinstance[tree->face[a].obi];
222                 vlr= RE_findOrAddVlak(obi->obr, tree->face[a].facenr);
223
224                 occ_shade(&ssamp, obi, vlr, tree->rad[a]);
225         }
226 }
227
228 /* ------------------------- Spherical Harmonics --------------------------- */
229
230 /* Use 2nd order SH => 9 coefficients, stored in this order:
231    0 = (0,0),
232    1 = (1,-1), 2 = (1,0), 3 = (1,1),
233    4 = (2,-2), 5 = (2,-1), 6 = (2,0), 7 = (2,1), 8 = (2,2) */
234
235 static void sh_copy(float *shresult, float *sh)
236 {
237         memcpy(shresult, sh, sizeof(float)*9);
238 }
239
240 static void sh_mul(float *sh, float f)
241 {
242         int i;
243
244         for(i=0; i<9; i++)
245                 sh[i] *= f;
246 }
247
248 static void sh_add(float *shresult, float *sh1, float *sh2)
249 {
250         int i;
251
252         for(i=0; i<9; i++)
253                 shresult[i]= sh1[i] + sh2[i];
254 }
255
256 static void sh_from_disc(float *n, float area, float *shresult)
257 {
258         /* See formula (3) in:
259            "An Efficient Representation for Irradiance Environment Maps" */
260         float sh[9], x, y, z;
261
262         x= n[0];
263         y= n[1];
264         z= n[2];
265
266         sh[0]= 0.282095f;
267
268         sh[1]= 0.488603f*y;
269         sh[2]= 0.488603f*z;
270         sh[3]= 0.488603f*x;
271         
272         sh[4]= 1.092548f*x*y;
273         sh[5]= 1.092548f*y*z;
274         sh[6]= 0.315392f*(3.0f*z*z - 1.0f);
275         sh[7]= 1.092548f*x*z;
276         sh[8]= 0.546274f*(x*x - y*y);
277
278         sh_mul(sh, area);
279         sh_copy(shresult, sh);
280 }
281
282 static float sh_eval(float *sh, float *v)
283 {
284         /* See formula (13) in:
285            "An Efficient Representation for Irradiance Environment Maps" */
286         static const float c1 = 0.429043f, c2 = 0.511664f, c3 = 0.743125f;
287         static const float c4 = 0.886227f, c5 = 0.247708f;
288         float x, y, z, sum;
289
290         x= v[0];
291         y= v[1];
292         z= v[2];
293
294         sum= c1*sh[8]*(x*x - y*y);
295         sum += c3*sh[6]*z*z;
296         sum += c4*sh[0];
297         sum += -c5*sh[6];
298         sum += 2.0f*c1*(sh[4]*x*y + sh[7]*x*z + sh[5]*y*z);
299         sum += 2.0f*c2*(sh[3]*x + sh[1]*y + sh[2]*z);
300
301         return sum;
302 }
303
304 /* ------------------------------ Building --------------------------------- */
305
306 static void occ_face(const OccFace *face, float *co, float *normal, float *area)
307 {
308         ObjectInstanceRen *obi;
309         VlakRen *vlr;
310         float v1[3], v2[3], v3[3], v4[3];
311
312         obi= &R.objectinstance[face->obi];
313         vlr= RE_findOrAddVlak(obi->obr, face->facenr);
314         
315         if(co) {
316                 if(vlr->v4)
317                         interp_v3_v3v3(co, vlr->v1->co, vlr->v3->co, 0.5f);
318                 else
319                         cent_tri_v3(co, vlr->v1->co, vlr->v2->co, vlr->v3->co);
320
321                 if(obi->flag & R_TRANSFORMED)
322                         mul_m4_v3(obi->mat, co);
323         }
324         
325         if(normal) {
326                 normal[0]= -vlr->n[0];
327                 normal[1]= -vlr->n[1];
328                 normal[2]= -vlr->n[2];
329
330                 if(obi->flag & R_TRANSFORMED)
331                         mul_m3_v3(obi->nmat, normal);
332         }
333
334         if(area) {
335                 VECCOPY(v1, vlr->v1->co);
336                 VECCOPY(v2, vlr->v2->co);
337                 VECCOPY(v3, vlr->v3->co);
338                 if(vlr->v4) VECCOPY(v4, vlr->v4->co);
339
340                 if(obi->flag & R_TRANSFORMED) {
341                         mul_m4_v3(obi->mat, v1);
342                         mul_m4_v3(obi->mat, v2);
343                         mul_m4_v3(obi->mat, v3);
344                         if(vlr->v4) mul_m4_v3(obi->mat, v4);
345                 }
346
347                 /* todo: correct area for instances */
348                 if(vlr->v4)
349                         *area= area_quad_v3(v1, v2, v3, v4);
350                 else
351                         *area= area_tri_v3(v1, v2, v3);
352         }
353 }
354
355 static void occ_sum_occlusion(OcclusionTree *tree, OccNode *node)
356 {
357         OccNode *child;
358         float occ, area, totarea, rad[3];
359         int a, b;
360
361         occ= 0.0f;
362         totarea= 0.0f;
363         zero_v3(rad);
364
365         for(b=0; b<TOTCHILD; b++) {
366                 if(node->childflag & (1<<b)) {
367                         a= node->child[b].face;
368                         occ_face(&tree->face[a], 0, 0, &area);
369                         occ += area*tree->occlusion[a];
370                         madd_v3_v3fl(rad, tree->rad[a], area);
371                         totarea += area;
372                 }
373                 else if(node->child[b].node) {
374                         child= node->child[b].node;
375                         occ_sum_occlusion(tree, child);
376
377                         occ += child->area*child->occlusion;
378                         madd_v3_v3fl(rad, child->rad, child->area);
379                         totarea += child->area;
380                 }
381         }
382
383         if(totarea != 0.0f) {
384                 occ /= totarea;
385                 mul_v3_fl(rad, 1.0f/totarea);
386         }
387         
388         node->occlusion= occ;
389         copy_v3_v3(node->rad, rad);
390 }
391
392 static int occ_find_bbox_axis(OcclusionTree *tree, int begin, int end, float *min, float *max)
393 {
394         float len, maxlen= -1.0f;
395         int a, axis = 0;
396
397         INIT_MINMAX(min, max);
398
399         for(a=begin; a<end; a++)
400                 DO_MINMAX(tree->co[a], min, max)
401
402         for(a=0; a<3; a++) {
403                 len= max[a] - min[a];
404
405                 if(len > maxlen) {
406                         maxlen= len;
407                         axis= a;
408                 }
409         }
410
411         return axis;
412 }
413
414 static void occ_node_from_face(OccFace *face, OccNode *node)
415 {
416         float n[3];
417
418         occ_face(face, node->co, n, &node->area);
419         node->dco= 0.0f;
420         sh_from_disc(n, node->area, node->sh);
421 }
422
423 static void occ_build_dco(OcclusionTree *tree, OccNode *node, float *co, float *dco)
424 {
425         OccNode *child;
426         float dist, d[3], nco[3];
427         int b;
428
429         for(b=0; b<TOTCHILD; b++) {
430                 if(node->childflag & (1<<b)) {
431                         occ_face(tree->face+node->child[b].face, nco, 0, 0);
432                 }
433                 else if(node->child[b].node) {
434                         child= node->child[b].node;
435                         occ_build_dco(tree, child, co, dco);
436                         VECCOPY(nco, child->co);
437                 }
438
439                 VECSUB(d, nco, co);
440                 dist= INPR(d, d);
441                 if(dist > *dco)
442                         *dco= dist;
443         }
444 }
445
446 static void occ_build_split(OcclusionTree *tree, int begin, int end, int *split)
447 {
448         float min[3], max[3], mid;
449         int axis, a, enda;
450
451         /* split in middle of boundbox. this seems faster than median split
452          * on complex scenes, possibly since it avoids two distant faces to
453          * be in the same node better? */
454         axis= occ_find_bbox_axis(tree, begin, end, min, max);
455         mid= 0.5f*(min[axis]+max[axis]);
456
457         a= begin;
458         enda= end;
459         while(a<enda) {
460                 if(tree->co[a][axis] > mid) {
461                         enda--;
462                         SWAP(OccFace, tree->face[a], tree->face[enda]);
463                         SWAP(float, tree->co[a][0], tree->co[enda][0]);
464                         SWAP(float, tree->co[a][1], tree->co[enda][1]);
465                         SWAP(float, tree->co[a][2], tree->co[enda][2]);
466                 }
467                 else
468                         a++;
469         }
470
471         *split= enda;
472 }
473
474 static void occ_build_8_split(OcclusionTree *tree, int begin, int end, int *offset, int *count)
475 {
476         /* split faces into eight groups */
477         int b, splitx, splity[2], splitz[4];
478
479         occ_build_split(tree, begin, end, &splitx);
480
481         /* force split if none found, to deal with degenerate geometry */
482         if(splitx == begin || splitx == end)
483                 splitx= (begin+end)/2;
484
485         occ_build_split(tree, begin, splitx, &splity[0]);
486         occ_build_split(tree, splitx, end, &splity[1]);
487
488         occ_build_split(tree, begin, splity[0], &splitz[0]);
489         occ_build_split(tree, splity[0], splitx, &splitz[1]);
490         occ_build_split(tree, splitx, splity[1], &splitz[2]);
491         occ_build_split(tree, splity[1], end, &splitz[3]);
492
493         offset[0]= begin;
494         offset[1]= splitz[0];
495         offset[2]= splity[0];
496         offset[3]= splitz[1];
497         offset[4]= splitx;
498         offset[5]= splitz[2];
499         offset[6]= splity[1];
500         offset[7]= splitz[3];
501
502         for(b=0; b<7; b++)
503                 count[b]= offset[b+1] - offset[b];
504         count[7]= end - offset[7];
505 }
506
507 static void occ_build_recursive(OcclusionTree *tree, OccNode *node, int begin, int end, int depth);
508
509 static void *exec_occ_build(void *data)
510 {
511         OcclusionBuildThread *othread= (OcclusionBuildThread*)data;
512
513         occ_build_recursive(othread->tree, othread->node, othread->begin, othread->end, othread->depth);
514
515         return 0;
516 }
517
518 static void occ_build_recursive(OcclusionTree *tree, OccNode *node, int begin, int end, int depth)
519 {
520         ListBase threads;
521         OcclusionBuildThread othreads[BLENDER_MAX_THREADS];
522         OccNode *child, tmpnode;
523         OccFace *face;
524         int a, b, totthread=0, offset[TOTCHILD], count[TOTCHILD];
525
526         /* add a new node */
527         node->occlusion= 1.0f;
528
529         /* leaf node with only children */
530         if(end - begin <= TOTCHILD) {
531                 for(a=begin, b=0; a<end; a++, b++) {
532                         face= &tree->face[a];
533                         node->child[b].face= a;
534                         node->childflag |= (1<<b);
535                 }
536         }
537         else {
538                 /* order faces */
539                 occ_build_8_split(tree, begin, end, offset, count);
540
541                 if(depth == 1 && tree->dothreadedbuild)
542                         BLI_init_threads(&threads, exec_occ_build, tree->totbuildthread);
543
544                 for(b=0; b<TOTCHILD; b++) {
545                         if(count[b] == 0) {
546                                 node->child[b].node= NULL;
547                         }
548                         else if(count[b] == 1) {
549                                 face= &tree->face[offset[b]];
550                                 node->child[b].face= offset[b];
551                                 node->childflag |= (1<<b);
552                         }
553                         else {
554                                 if(tree->dothreadedbuild)
555                                         BLI_lock_thread(LOCK_CUSTOM1);
556
557                                 child= BLI_memarena_alloc(tree->arena, sizeof(OccNode));
558                                 node->child[b].node= child;
559
560                                 /* keep track of maximum depth for stack */
561                                 if(depth+1 > tree->maxdepth)
562                                         tree->maxdepth= depth+1;
563
564                                 if(tree->dothreadedbuild)
565                                         BLI_unlock_thread(LOCK_CUSTOM1);
566
567                                 if(depth == 1 && tree->dothreadedbuild) {
568                                         othreads[totthread].tree= tree;
569                                         othreads[totthread].node= child;
570                                         othreads[totthread].begin= offset[b];
571                                         othreads[totthread].end= offset[b]+count[b];
572                                         othreads[totthread].depth= depth+1;
573                                         BLI_insert_thread(&threads, &othreads[totthread]);
574                                         totthread++;
575                                 }
576                                 else
577                                         occ_build_recursive(tree, child, offset[b], offset[b]+count[b], depth+1);
578                         }
579                 }
580
581                 if(depth == 1 && tree->dothreadedbuild)
582                         BLI_end_threads(&threads);
583         }
584
585         /* combine area, position and sh */
586         for(b=0; b<TOTCHILD; b++) {
587                 if(node->childflag & (1<<b)) {
588                         child= &tmpnode;
589                         occ_node_from_face(tree->face+node->child[b].face, &tmpnode);
590                 }
591                 else {
592                         child= node->child[b].node;
593                 }
594
595                 if(child) {
596                         node->area += child->area;
597                         sh_add(node->sh, node->sh, child->sh);
598                         VECADDFAC(node->co, node->co, child->co, child->area);
599                 }
600         }
601
602         if(node->area != 0.0f)
603                 mul_v3_fl(node->co, 1.0f/node->area);
604
605         /* compute maximum distance from center */
606         node->dco= 0.0f;
607         occ_build_dco(tree, node, node->co, &node->dco);
608 }
609
610 static void occ_build_sh_normalize(OccNode *node)
611 {
612         /* normalize spherical harmonics to not include area, so
613          * we can clamp the dot product and then mutliply by area */
614         int b;
615
616         if(node->area != 0.0f)
617                 sh_mul(node->sh, 1.0f/node->area);
618
619         for(b=0; b<TOTCHILD; b++) {
620                 if(node->childflag & (1<<b));
621                 else if(node->child[b].node)
622                         occ_build_sh_normalize(node->child[b].node);
623         }
624 }
625
626 static OcclusionTree *occ_tree_build(Render *re)
627 {
628         OcclusionTree *tree;
629         ObjectInstanceRen *obi;
630         ObjectRen *obr;
631         VlakRen *vlr= NULL;
632         int a, b, c, totface;
633
634         /* count */
635         totface= 0;
636         for(obi=re->instancetable.first; obi; obi=obi->next) {
637                 obr= obi->obr;
638                 for(a=0; a<obr->totvlak; a++) {
639                         if((a & 255)==0) vlr= obr->vlaknodes[a>>8].vlak;
640                         else vlr++;
641
642                         if((vlr->mat->mode & MA_TRACEBLE) && (vlr->mat->material_type == MA_TYPE_SURFACE))
643                                 totface++;
644                 }
645         }
646
647         if(totface == 0)
648                 return NULL;
649         
650         tree= MEM_callocN(sizeof(OcclusionTree), "OcclusionTree");
651         tree->totface= totface;
652
653         /* parameters */
654         tree->error= get_render_aosss_error(&re->r, re->wrld.ao_approx_error);
655         tree->distfac= (re->wrld.aomode & WO_AODIST)? re->wrld.aodistfac: 0.0f;
656         tree->doindirect= (re->wrld.ao_indirect_energy > 0.0f && re->wrld.ao_indirect_bounces > 0);
657
658         /* allocation */
659         tree->arena= BLI_memarena_new(0x8000 * sizeof(OccNode));
660         BLI_memarena_use_calloc(tree->arena);
661
662         if(re->wrld.aomode & WO_AOCACHE)
663                 tree->cache= MEM_callocN(sizeof(OcclusionCache)*BLENDER_MAX_THREADS, "OcclusionCache");
664
665         tree->face= MEM_callocN(sizeof(OccFace)*totface, "OcclusionFace");
666         tree->co= MEM_callocN(sizeof(float)*3*totface, "OcclusionCo");
667         tree->occlusion= MEM_callocN(sizeof(float)*totface, "OcclusionOcclusion");
668
669         if(tree->doindirect)
670                 tree->rad= MEM_callocN(sizeof(float)*3*totface, "OcclusionRad");
671
672         /* make array of face pointers */
673         for(b=0, c=0, obi=re->instancetable.first; obi; obi=obi->next, c++) {
674                 obr= obi->obr;
675                 for(a=0; a<obr->totvlak; a++) {
676                         if((a & 255)==0) vlr= obr->vlaknodes[a>>8].vlak;
677                         else vlr++;
678
679                         if((vlr->mat->mode & MA_TRACEBLE) && (vlr->mat->material_type == MA_TYPE_SURFACE)) {
680                                 tree->face[b].obi= c;
681                                 tree->face[b].facenr= a;
682                                 tree->occlusion[b]= 1.0f;
683                                 occ_face(&tree->face[b], tree->co[b], NULL, NULL); 
684                                 b++;
685                         }
686                 }
687         }
688
689         /* threads */
690         tree->totbuildthread= (re->r.threads > 1 && totface > 10000)? 8: 1;
691         tree->dothreadedbuild= (tree->totbuildthread > 1);
692
693         /* recurse */
694         tree->root= BLI_memarena_alloc(tree->arena, sizeof(OccNode));
695         tree->maxdepth= 1;
696         occ_build_recursive(tree, tree->root, 0, totface, 1);
697
698         if(tree->doindirect) {
699                 occ_build_shade(re, tree);
700                 occ_sum_occlusion(tree, tree->root);
701         }
702         
703         MEM_freeN(tree->co);
704         tree->co= NULL;
705
706         occ_build_sh_normalize(tree->root);
707
708         for(a=0; a<BLENDER_MAX_THREADS; a++)
709                 tree->stack[a]= MEM_callocN(sizeof(OccNode)*TOTCHILD*(tree->maxdepth+1), "OccStack");
710
711         return tree;
712 }
713
714 static void occ_free_tree(OcclusionTree *tree)
715 {
716         int a;
717
718         if(tree) {
719                 if(tree->arena) BLI_memarena_free(tree->arena);
720                 for(a=0; a<BLENDER_MAX_THREADS; a++)
721                         if(tree->stack[a])
722                                 MEM_freeN(tree->stack[a]);
723                 if(tree->occlusion) MEM_freeN(tree->occlusion);
724                 if(tree->cache) MEM_freeN(tree->cache);
725                 if(tree->face) MEM_freeN(tree->face);
726                 if(tree->rad) MEM_freeN(tree->rad);
727                 MEM_freeN(tree);
728         }
729 }
730
731 /* ------------------------- Traversal --------------------------- */
732
733 static float occ_solid_angle(OccNode *node, float *v, float d2, float invd2, float *receivenormal)
734 {
735         float dotreceive, dotemit;
736         float ev[3];
737
738         ev[0]= -v[0]*invd2;
739         ev[1]= -v[1]*invd2;
740         ev[2]= -v[2]*invd2;
741         dotemit= sh_eval(node->sh, ev);
742         dotreceive= INPR(receivenormal, v)*invd2;
743
744         CLAMP(dotemit, 0.0f, 1.0f);
745         CLAMP(dotreceive, 0.0f, 1.0f);
746         
747         return ((node->area*dotemit*dotreceive)/(d2 + node->area*INVPI))*INVPI;
748 }
749
750 static void VecAddDir(float *result, float *v1, float *v2, float fac)
751 {
752         result[0]= v1[0] + fac*(v2[0] - v1[0]);
753         result[1]= v1[1] + fac*(v2[1] - v1[1]);
754         result[2]= v1[2] + fac*(v2[2] - v1[2]);
755 }
756
757 static int occ_visible_quad(float *p, float *n, float *v0, float *v1, float *v2, float *q0, float *q1, float *q2, float *q3)
758 {
759         static const float epsilon = 1e-6f;
760         float c, sd[3];
761         
762         c= INPR(n, p);
763
764         /* signed distances from the vertices to the plane. */
765         sd[0]= INPR(n, v0) - c;
766         sd[1]= INPR(n, v1) - c;
767         sd[2]= INPR(n, v2) - c;
768
769         if(fabsf(sd[0]) < epsilon) sd[0] = 0.0f;
770         if(fabsf(sd[1]) < epsilon) sd[1] = 0.0f;
771         if(fabsf(sd[2]) < epsilon) sd[2] = 0.0f;
772
773         if(sd[0] > 0) {
774                 if(sd[1] > 0) {
775                         if(sd[2] > 0) {
776                                 // +++
777                                 VECCOPY(q0, v0);
778                                 VECCOPY(q1, v1);
779                                 VECCOPY(q2, v2);
780                                 VECCOPY(q3, q2);
781                         }
782                         else if(sd[2] < 0) {
783                                 // ++-
784                                 VECCOPY(q0, v0);
785                                 VECCOPY(q1, v1);
786                                 VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
787                                 VecAddDir(q3, v0, v2, (sd[0]/(sd[0]-sd[2])));
788                         }
789                         else {
790                                 // ++0
791                                 VECCOPY(q0, v0);
792                                 VECCOPY(q1, v1);
793                                 VECCOPY(q2, v2);
794                                 VECCOPY(q3, q2);
795                         }
796                 }
797                 else if(sd[1] < 0) {
798                         if(sd[2] > 0) {
799                                 // +-+
800                                 VECCOPY(q0, v0);
801                                 VecAddDir(q1, v0, v1, (sd[0]/(sd[0]-sd[1])));
802                                 VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
803                                 VECCOPY(q3, v2);
804                         }
805                         else if(sd[2] < 0) {
806                                 // +--
807                                 VECCOPY(q0, v0);
808                                 VecAddDir(q1, v0, v1, (sd[0]/(sd[0]-sd[1])));
809                                 VecAddDir(q2, v0, v2, (sd[0]/(sd[0]-sd[2])));
810                                 VECCOPY(q3, q2);
811                         }
812                         else {
813                                 // +-0
814                                 VECCOPY(q0, v0);
815                                 VecAddDir(q1, v0, v1, (sd[0]/(sd[0]-sd[1])));
816                                 VECCOPY(q2, v2);
817                                 VECCOPY(q3, q2);
818                         }
819                 }
820                 else {
821                         if(sd[2] > 0) {
822                                 // +0+
823                                 VECCOPY(q0, v0);
824                                 VECCOPY(q1, v1);
825                                 VECCOPY(q2, v2);
826                                 VECCOPY(q3, q2);
827                         }
828                         else if(sd[2] < 0) {
829                                 // +0-
830                                 VECCOPY(q0, v0);
831                                 VECCOPY(q1, v1);
832                                 VecAddDir(q2, v0, v2, (sd[0]/(sd[0]-sd[2])));
833                                 VECCOPY(q3, q2);
834                         }
835                         else {
836                                 // +00
837                                 VECCOPY(q0, v0);
838                                 VECCOPY(q1, v1);
839                                 VECCOPY(q2, v2);
840                                 VECCOPY(q3, q2);
841                         }
842                 }
843         }
844         else if(sd[0] < 0) {
845                 if(sd[1] > 0) {
846                         if(sd[2] > 0) {
847                                 // -++
848                                 VecAddDir(q0, v0, v1, (sd[0]/(sd[0]-sd[1])));
849                                 VECCOPY(q1, v1);
850                                 VECCOPY(q2, v2);
851                                 VecAddDir(q3, v0, v2, (sd[0]/(sd[0]-sd[2])));
852                         }
853                         else if(sd[2] < 0) {
854                                 // -+-
855                                 VecAddDir(q0, v0, v1, (sd[0]/(sd[0]-sd[1])));
856                                 VECCOPY(q1, v1);
857                                 VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
858                                 VECCOPY(q3, q2);
859                         }
860                         else {
861                                 // -+0
862                                 VecAddDir(q0, v0, v1, (sd[0]/(sd[0]-sd[1])));
863                                 VECCOPY(q1, v1);
864                                 VECCOPY(q2, v2);
865                                 VECCOPY(q3, q2);
866                         }
867                 }
868                 else if(sd[1] < 0) {
869                         if(sd[2] > 0) {
870                                 // --+
871                                 VecAddDir(q0, v0, v2, (sd[0]/(sd[0]-sd[2])));
872                                 VecAddDir(q1, v1, v2, (sd[1]/(sd[1]-sd[2])));
873                                 VECCOPY(q2, v2);
874                                 VECCOPY(q3, q2);
875                         }
876                         else if(sd[2] < 0) {
877                                 // ---
878                                 return 0;
879                         }
880                         else {
881                                 // --0
882                                 return 0;
883                         }
884                 }
885                 else {
886                         if(sd[2] > 0) {
887                                 // -0+
888                                 VecAddDir(q0, v0, v2, (sd[0]/(sd[0]-sd[2])));
889                                 VECCOPY(q1, v1);
890                                 VECCOPY(q2, v2);
891                                 VECCOPY(q3, q2);
892                         }
893                         else if(sd[2] < 0) {
894                                 // -0-
895                                 return 0;
896                         }
897                         else {
898                                 // -00
899                                 return 0;
900                         }
901                 }
902         }
903         else {
904                 if(sd[1] > 0) {
905                         if(sd[2] > 0) {
906                                 // 0++
907                                 VECCOPY(q0, v0);
908                                 VECCOPY(q1, v1);
909                                 VECCOPY(q2, v2);
910                                 VECCOPY(q3, q2);
911                         }
912                         else if(sd[2] < 0) {
913                                 // 0+-
914                                 VECCOPY(q0, v0);
915                                 VECCOPY(q1, v1);
916                                 VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
917                                 VECCOPY(q3, q2);
918                         }
919                         else {
920                                 // 0+0
921                                 VECCOPY(q0, v0);
922                                 VECCOPY(q1, v1);
923                                 VECCOPY(q2, v2);
924                                 VECCOPY(q3, q2);
925                         }
926                 }
927                 else if(sd[1] < 0) {
928                         if(sd[2] > 0) {
929                                 // 0-+
930                                 VECCOPY(q0, v0);
931                                 VecAddDir(q1, v1, v2, (sd[1]/(sd[1]-sd[2])));
932                                 VECCOPY(q2, v2);
933                                 VECCOPY(q3, q2);
934                         }
935                         else if(sd[2] < 0) {
936                                 // 0--
937                                 return 0;
938                         }
939                         else {
940                                 // 0-0
941                                 return 0;
942                         }
943                 }
944                 else {
945                         if(sd[2] > 0) {
946                                 // 00+
947                                 VECCOPY(q0, v0);
948                                 VECCOPY(q1, v1);
949                                 VECCOPY(q2, v2);
950                                 VECCOPY(q3, q2);
951                         }
952                         else if(sd[2] < 0) {
953                                 // 00-
954                                 return 0;
955                         }
956                         else {
957                                 // 000
958                                 return 0;
959                         }
960                 }
961         }
962
963         return 1;
964 }
965
966 /* altivec optimization, this works, but is unused */
967
968 #if 0
969 #include <Accelerate/Accelerate.h>
970
971 typedef union {
972         vFloat v;
973         float f[4];
974 } vFloatResult;
975
976 static vFloat vec_splat_float(float val)
977 {
978         return (vFloat){val, val, val, val};
979 }
980
981 static float occ_quad_form_factor(float *p, float *n, float *q0, float *q1, float *q2, float *q3)
982 {
983         vFloat vcos, rlen, vrx, vry, vrz, vsrx, vsry, vsrz, gx, gy, gz, vangle;
984         vUInt8 rotate = (vUInt8){4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3};
985         vFloatResult vresult;
986         float result;
987
988         /* compute r* */
989         vrx = (vFloat){q0[0], q1[0], q2[0], q3[0]} - vec_splat_float(p[0]);
990         vry = (vFloat){q0[1], q1[1], q2[1], q3[1]} - vec_splat_float(p[1]);
991         vrz = (vFloat){q0[2], q1[2], q2[2], q3[2]} - vec_splat_float(p[2]);
992
993         /* normalize r* */
994         rlen = vec_rsqrte(vrx*vrx + vry*vry + vrz*vrz + vec_splat_float(1e-16f));
995         vrx = vrx*rlen;
996         vry = vry*rlen;
997         vrz = vrz*rlen;
998
999         /* rotate r* for cross and dot */
1000         vsrx= vec_perm(vrx, vrx, rotate);
1001         vsry= vec_perm(vry, vry, rotate);
1002         vsrz= vec_perm(vrz, vrz, rotate);
1003
1004         /* cross product */
1005         gx = vsry*vrz - vsrz*vry;
1006         gy = vsrz*vrx - vsrx*vrz;
1007         gz = vsrx*vry - vsry*vrx;
1008
1009         /* normalize */
1010         rlen = vec_rsqrte(gx*gx + gy*gy + gz*gz + vec_splat_float(1e-16f));
1011         gx = gx*rlen;
1012         gy = gy*rlen;
1013         gz = gz*rlen;
1014
1015         /* angle */
1016         vcos = vrx*vsrx + vry*vsry + vrz*vsrz;
1017         vcos= vec_max(vec_min(vcos, vec_splat_float(1.0f)), vec_splat_float(-1.0f));
1018         vangle= vacosf(vcos);
1019
1020         /* dot */
1021         vresult.v = (vec_splat_float(n[0])*gx +
1022                      vec_splat_float(n[1])*gy +
1023                      vec_splat_float(n[2])*gz)*vangle;
1024
1025         result= (vresult.f[0] + vresult.f[1] + vresult.f[2] + vresult.f[3])*(0.5f/(float)M_PI);
1026         result= MAX2(result, 0.0f);
1027
1028         return result;
1029 }
1030
1031 #endif
1032
1033 /* SSE optimization, acos code doesn't work */
1034
1035 #if 0
1036
1037 #include <xmmintrin.h>
1038
1039 static __m128 sse_approx_acos(__m128 x)
1040 {
1041         /* needs a better approximation than taylor expansion of acos, since that
1042          * gives big erros for near 1.0 values, sqrt(2*x)*acos(1-x) should work
1043          * better, see http://www.tom.womack.net/projects/sse-fast-arctrig.html */
1044
1045         return _mm_set_ps1(1.0f);
1046 }
1047
1048 static float occ_quad_form_factor(float *p, float *n, float *q0, float *q1, float *q2, float *q3)
1049 {
1050         float r0[3], r1[3], r2[3], r3[3], g0[3], g1[3], g2[3], g3[3];
1051         float a1, a2, a3, a4, dot1, dot2, dot3, dot4, result;
1052         float fresult[4] __attribute__((aligned(16)));
1053         __m128 qx, qy, qz, rx, ry, rz, rlen, srx, sry, srz, gx, gy, gz, glen, rcos, angle, aresult;
1054
1055         /* compute r */
1056         qx = _mm_set_ps(q3[0], q2[0], q1[0], q0[0]);
1057         qy = _mm_set_ps(q3[1], q2[1], q1[1], q0[1]);
1058         qz = _mm_set_ps(q3[2], q2[2], q1[2], q0[2]);
1059
1060         rx = qx - _mm_set_ps1(p[0]);
1061         ry = qy - _mm_set_ps1(p[1]);
1062         rz = qz - _mm_set_ps1(p[2]);
1063
1064         /* normalize r */
1065         rlen = _mm_rsqrt_ps(rx*rx + ry*ry + rz*rz + _mm_set_ps1(1e-16f));
1066         rx = rx*rlen;
1067         ry = ry*rlen;
1068         rz = rz*rlen;
1069
1070         /* cross product */
1071         srx = _mm_shuffle_ps(rx, rx, _MM_SHUFFLE(0,3,2,1));
1072         sry = _mm_shuffle_ps(ry, ry, _MM_SHUFFLE(0,3,2,1));
1073         srz = _mm_shuffle_ps(rz, rz, _MM_SHUFFLE(0,3,2,1));
1074
1075         gx = sry*rz - srz*ry;
1076         gy = srz*rx - srx*rz;
1077         gz = srx*ry - sry*rx;
1078
1079         /* normalize g */
1080         glen = _mm_rsqrt_ps(gx*gx + gy*gy + gz*gz + _mm_set_ps1(1e-16f));
1081         gx = gx*glen;
1082         gy = gy*glen;
1083         gz = gz*glen;
1084
1085         /* compute angle */
1086         rcos = rx*srx + ry*sry + rz*srz;
1087         rcos= _mm_max_ps(_mm_min_ps(rcos, _mm_set_ps1(1.0f)), _mm_set_ps1(-1.0f));
1088
1089         angle = sse_approx_cos(rcos);
1090         aresult = (_mm_set_ps1(n[0])*gx + _mm_set_ps1(n[1])*gy + _mm_set_ps1(n[2])*gz)*angle;
1091
1092         /* sum together */
1093     result= (fresult[0] + fresult[1] + fresult[2] + fresult[3])*(0.5f/(float)M_PI);
1094         result= MAX2(result, 0.0f);
1095
1096         return result;
1097 }
1098
1099 #endif
1100
1101 static void normalizef(float *n)
1102 {
1103         float d;
1104         
1105         d= INPR(n, n);
1106
1107         if(d > 1.0e-35F) {
1108                 d= 1.0f/sqrtf(d);
1109
1110                 n[0] *= d; 
1111                 n[1] *= d; 
1112                 n[2] *= d;
1113         } 
1114 }
1115
1116 static float occ_quad_form_factor(float *p, float *n, float *q0, float *q1, float *q2, float *q3)
1117 {
1118         float r0[3], r1[3], r2[3], r3[3], g0[3], g1[3], g2[3], g3[3];
1119         float a1, a2, a3, a4, dot1, dot2, dot3, dot4, result;
1120
1121         VECSUB(r0, q0, p);
1122         VECSUB(r1, q1, p);
1123         VECSUB(r2, q2, p);
1124         VECSUB(r3, q3, p);
1125
1126         normalizef(r0);
1127         normalizef(r1);
1128         normalizef(r2);
1129         normalizef(r3);
1130
1131         cross_v3_v3v3(g0, r1, r0); normalizef(g0);
1132         cross_v3_v3v3(g1, r2, r1); normalizef(g1);
1133         cross_v3_v3v3(g2, r3, r2); normalizef(g2);
1134         cross_v3_v3v3(g3, r0, r3); normalizef(g3);
1135
1136         a1= saacosf(INPR(r0, r1));
1137         a2= saacosf(INPR(r1, r2));
1138         a3= saacosf(INPR(r2, r3));
1139         a4= saacosf(INPR(r3, r0));
1140
1141         dot1= INPR(n, g0);
1142         dot2= INPR(n, g1);
1143         dot3= INPR(n, g2);
1144         dot4= INPR(n, g3);
1145
1146         result= (a1*dot1 + a2*dot2 + a3*dot3 + a4*dot4)*0.5f/(float)M_PI;
1147         result= MAX2(result, 0.0f);
1148
1149         return result;
1150 }
1151
1152 static float occ_form_factor(OccFace *face, float *p, float *n)
1153 {
1154         ObjectInstanceRen *obi;
1155         VlakRen *vlr;
1156         float v1[3], v2[3], v3[3], v4[3], q0[3], q1[3], q2[3], q3[3], contrib= 0.0f;
1157
1158         obi= &R.objectinstance[face->obi];
1159         vlr= RE_findOrAddVlak(obi->obr, face->facenr);
1160
1161         VECCOPY(v1, vlr->v1->co);
1162         VECCOPY(v2, vlr->v2->co);
1163         VECCOPY(v3, vlr->v3->co);
1164
1165         if(obi->flag & R_TRANSFORMED) {
1166                 mul_m4_v3(obi->mat, v1);
1167                 mul_m4_v3(obi->mat, v2);
1168                 mul_m4_v3(obi->mat, v3);
1169         }
1170
1171         if(occ_visible_quad(p, n, v1, v2, v3, q0, q1, q2, q3))
1172                 contrib += occ_quad_form_factor(p, n, q0, q1, q2, q3);
1173
1174         if(vlr->v4) {
1175                 VECCOPY(v4, vlr->v4->co);
1176                 if(obi->flag & R_TRANSFORMED)
1177                         mul_m4_v3(obi->mat, v4);
1178
1179                 if(occ_visible_quad(p, n, v1, v3, v4, q0, q1, q2, q3))
1180                         contrib += occ_quad_form_factor(p, n, q0, q1, q2, q3);
1181         }
1182
1183         return contrib;
1184 }
1185
1186 static void occ_lookup(OcclusionTree *tree, int thread, OccFace *exclude, float *pp, float *pn, float *occ, float rad[3], float bentn[3])
1187 {
1188         OccNode *node, **stack;
1189         OccFace *face;
1190         float resultocc, resultrad[3], v[3], p[3], n[3], co[3], invd2;
1191         float distfac, fac, error, d2, weight, emitarea;
1192         int b, f, totstack;
1193
1194         /* init variables */
1195         VECCOPY(p, pp);
1196         VECCOPY(n, pn);
1197         VECADDFAC(p, p, n, 1e-4f);
1198
1199         if(bentn)
1200                 copy_v3_v3(bentn, n);
1201         
1202         error= tree->error;
1203         distfac= tree->distfac;
1204
1205         resultocc= 0.0f;
1206         zero_v3(resultrad);
1207
1208         /* init stack */
1209         stack= tree->stack[thread];
1210         stack[0]= tree->root;
1211         totstack= 1;
1212
1213         while(totstack) {
1214                 /* pop point off the stack */
1215                 node= stack[--totstack];
1216
1217                 VECSUB(v, node->co, p);
1218                 d2= INPR(v, v) + 1e-16f;
1219                 emitarea= MAX2(node->area, node->dco);
1220
1221                 if(d2*error > emitarea) {
1222                         if(distfac != 0.0f) {
1223                                 fac= 1.0f/(1.0f + distfac*d2);
1224                                 if(fac < 0.01f)
1225                                         continue;
1226                         }
1227                         else
1228                                 fac= 1.0f;
1229
1230                         /* accumulate occlusion from spherical harmonics */
1231                         invd2 = 1.0f/sqrtf(d2);
1232                         weight= occ_solid_angle(node, v, d2, invd2, n);
1233
1234                         if(rad)
1235                                 madd_v3_v3fl(resultrad, node->rad, weight*fac);
1236
1237                         weight *= node->occlusion;
1238
1239                         if(bentn) {
1240                                 bentn[0] -= weight*invd2*v[0];
1241                                 bentn[1] -= weight*invd2*v[1];
1242                                 bentn[2] -= weight*invd2*v[2];
1243                         }
1244
1245                         resultocc += weight*fac;
1246                 }
1247                 else {
1248                         /* traverse into children */
1249                         for(b=0; b<TOTCHILD; b++) {
1250                                 if(node->childflag & (1<<b)) {
1251                                         f= node->child[b].face;
1252                                         face= &tree->face[f];
1253
1254                                         /* accumulate occlusion with face form factor */
1255                                         if(!exclude || !(face->obi == exclude->obi && face->facenr == exclude->facenr)) {
1256                                                 if(bentn || distfac != 0.0f) {
1257                                                         occ_face(face, co, NULL, NULL); 
1258                                                         VECSUB(v, co, p);
1259                                                         d2= INPR(v, v) + 1e-16f;
1260
1261                                                         fac= (distfac == 0.0f)? 1.0f: 1.0f/(1.0f + distfac*d2);
1262                                                         if(fac < 0.01f)
1263                                                                 continue;
1264                                                 }
1265                                                 else
1266                                                         fac= 1.0f;
1267
1268                                                 weight= occ_form_factor(face, p, n);
1269
1270                                                 if(rad)
1271                                                         madd_v3_v3fl(resultrad, tree->rad[f], weight*fac);
1272
1273                                                 weight *= tree->occlusion[f];
1274
1275                                                 if(bentn) {
1276                                                         invd2= 1.0f/sqrtf(d2);
1277                                                         bentn[0] -= weight*invd2*v[0];
1278                                                         bentn[1] -= weight*invd2*v[1];
1279                                                         bentn[2] -= weight*invd2*v[2];
1280                                                 }
1281
1282                                                 resultocc += weight*fac;
1283                                         }
1284                                 }
1285                                 else if(node->child[b].node) {
1286                                         /* push child on the stack */
1287                                         stack[totstack++]= node->child[b].node;
1288                                 }
1289                         }
1290                 }
1291         }
1292
1293         if(occ) *occ= resultocc;
1294         if(rad) copy_v3_v3(rad, resultrad);
1295         /*if(rad && exclude) {
1296                 int a;
1297                 for(a=0; a<tree->totface; a++)
1298                         if((tree->face[a].obi == exclude->obi && tree->face[a].facenr == exclude->facenr))
1299                                 copy_v3_v3(rad, tree->rad[a]);
1300         }*/
1301         if(bentn) normalize_v3(bentn);
1302 }
1303
1304 static void occ_compute_bounces(Render *re, OcclusionTree *tree, int totbounce)
1305 {
1306         float (*rad)[3], (*sum)[3], (*tmp)[3], co[3], n[3], occ;
1307         int bounce, i;
1308
1309         rad= MEM_callocN(sizeof(float)*3*tree->totface, "OcclusionBounceRad");
1310         sum= MEM_dupallocN(tree->rad);
1311
1312         for(bounce=1; bounce<totbounce; bounce++) {
1313                 for(i=0; i<tree->totface; i++) {
1314                         occ_face(&tree->face[i], co, n, NULL);
1315                         madd_v3_v3fl(co, n, 1e-8f);
1316
1317                         occ_lookup(tree, 0, &tree->face[i], co, n, &occ, rad[i], NULL);
1318                         rad[i][0]= MAX2(rad[i][0], 0.0f);
1319                         rad[i][1]= MAX2(rad[i][1], 0.0f);
1320                         rad[i][2]= MAX2(rad[i][2], 0.0f);
1321                         add_v3_v3(sum[i], rad[i]);
1322
1323                         if(re->test_break(re->tbh))
1324                                 break;
1325                 }
1326
1327                 if(re->test_break(re->tbh))
1328                         break;
1329
1330                 tmp= tree->rad;
1331                 tree->rad= rad;
1332                 rad= tmp;
1333
1334                 occ_sum_occlusion(tree, tree->root);
1335         }
1336
1337         MEM_freeN(rad);
1338         MEM_freeN(tree->rad);
1339         tree->rad= sum;
1340
1341         if(!re->test_break(re->tbh))
1342                 occ_sum_occlusion(tree, tree->root);
1343 }
1344
1345 static void occ_compute_passes(Render *re, OcclusionTree *tree, int totpass)
1346 {
1347         float *occ, co[3], n[3];
1348         int pass, i;
1349         
1350         occ= MEM_callocN(sizeof(float)*tree->totface, "OcclusionPassOcc");
1351
1352         for(pass=0; pass<totpass; pass++) {
1353                 for(i=0; i<tree->totface; i++) {
1354                         occ_face(&tree->face[i], co, n, NULL);
1355                         negate_v3(n);
1356                         VECADDFAC(co, co, n, 1e-8f);
1357
1358                         occ_lookup(tree, 0, &tree->face[i], co, n, &occ[i], NULL, NULL);
1359                         if(re->test_break(re->tbh))
1360                                 break;
1361                 }
1362
1363                 if(re->test_break(re->tbh))
1364                         break;
1365
1366                 for(i=0; i<tree->totface; i++) {
1367                         tree->occlusion[i] -= occ[i]; //MAX2(1.0f-occ[i], 0.0f);
1368                         if(tree->occlusion[i] < 0.0f)
1369                                 tree->occlusion[i]= 0.0f;
1370                 }
1371
1372                 occ_sum_occlusion(tree, tree->root);
1373         }
1374
1375         MEM_freeN(occ);
1376 }
1377
1378 static void sample_occ_tree(Render *re, OcclusionTree *tree, OccFace *exclude, float *co, float *n, int thread, int onlyshadow, float *ao, float *indirect)
1379 {
1380         float nn[3], bn[3], fac, occ, occlusion, correction, rad[3];
1381         int aocolor;
1382
1383         aocolor= re->wrld.aocolor;
1384         if(onlyshadow)
1385                 aocolor= WO_AOPLAIN;
1386
1387         VECCOPY(nn, n);
1388         negate_v3(nn);
1389
1390         occ_lookup(tree, thread, exclude, co, nn, &occ, (tree->doindirect)? rad: NULL, (aocolor)? bn: NULL);
1391
1392         correction= re->wrld.ao_approx_correction;
1393
1394         occlusion= (1.0f-correction)*(1.0f-occ);
1395         CLAMP(occlusion, 0.0f, 1.0f);
1396         if(correction != 0.0f)
1397                 occlusion += correction*exp(-occ);
1398
1399         if(aocolor) {
1400                 /* sky shading using bent normal */
1401                 if(ELEM(aocolor, WO_AOSKYCOL, WO_AOSKYTEX)) {
1402                         fac= 0.5*(1.0f+bn[0]*re->grvec[0]+ bn[1]*re->grvec[1]+ bn[2]*re->grvec[2]);
1403                         ao[0]= (1.0f-fac)*re->wrld.horr + fac*re->wrld.zenr;
1404                         ao[1]= (1.0f-fac)*re->wrld.horg + fac*re->wrld.zeng;
1405                         ao[2]= (1.0f-fac)*re->wrld.horb + fac*re->wrld.zenb;
1406                 }
1407 #if 0
1408                 else {  /* WO_AOSKYTEX */
1409                         float dxyview[3];
1410                         bn[0]= -bn[0];
1411                         bn[1]= -bn[1];
1412                         bn[2]= -bn[2];
1413                         dxyview[0]= 1.0f;
1414                         dxyview[1]= 1.0f;
1415                         dxyview[2]= 0.0f;
1416                         shadeSkyView(ao, co, bn, dxyview);
1417                 }
1418 #endif
1419
1420                 mul_v3_fl(ao, occlusion);
1421         }
1422         else {
1423                 ao[0]= occlusion;
1424                 ao[1]= occlusion;
1425                 ao[2]= occlusion;
1426         }
1427
1428         if(tree->doindirect) copy_v3_v3(indirect, rad);
1429         else zero_v3(indirect);
1430 }
1431
1432 /* ---------------------------- Caching ------------------------------- */
1433
1434 static OcclusionCacheSample *find_occ_sample(OcclusionCache *cache, int x, int y)
1435 {
1436         x -= cache->x;
1437         y -= cache->y;
1438
1439         x /= cache->step;
1440         y /= cache->step;
1441         x *= cache->step;
1442         y *= cache->step;
1443
1444         if(x < 0 || x >= cache->w || y < 0 || y >= cache->h)
1445                 return NULL;
1446         else
1447                 return &cache->sample[y*cache->w + x];
1448 }
1449
1450 static int sample_occ_cache(OcclusionTree *tree, float *co, float *n, int x, int y, int thread, float *ao, float *indirect)
1451 {
1452         OcclusionCache *cache;
1453         OcclusionCacheSample *samples[4], *sample;
1454         float wn[4], wz[4], wb[4], tx, ty, w, totw, mino, maxo;
1455         float d[3], dist2;
1456         int i, x1, y1, x2, y2;
1457
1458         if(!tree->cache)
1459                 return 0;
1460         
1461         /* first try to find a sample in the same pixel */
1462         cache= &tree->cache[thread];
1463
1464         if(cache->sample && cache->step) {
1465                 sample= &cache->sample[(y-cache->y)*cache->w + (x-cache->x)];
1466                 if(sample->filled) {
1467                         VECSUB(d, sample->co, co);
1468                         dist2= INPR(d, d);
1469                         if(dist2 < 0.5f*sample->dist2 && INPR(sample->n, n) > 0.98f) {
1470                                 VECCOPY(ao, sample->ao);
1471                                 VECCOPY(indirect, sample->indirect);
1472                                 return 1;
1473                         }
1474                 }
1475         }
1476         else
1477                 return 0;
1478
1479         /* try to interpolate between 4 neighbouring pixels */
1480         samples[0]= find_occ_sample(cache, x, y);
1481         samples[1]= find_occ_sample(cache, x+cache->step, y);
1482         samples[2]= find_occ_sample(cache, x, y+cache->step);
1483         samples[3]= find_occ_sample(cache, x+cache->step, y+cache->step);
1484
1485         for(i=0; i<4; i++)
1486                 if(!samples[i] || !samples[i]->filled)
1487                         return 0;
1488
1489         /* require intensities not being too different */
1490         mino= MIN4(samples[0]->intensity, samples[1]->intensity, samples[2]->intensity, samples[3]->intensity);
1491         maxo= MAX4(samples[0]->intensity, samples[1]->intensity, samples[2]->intensity, samples[3]->intensity);
1492
1493         if(maxo - mino > 0.05f)
1494                 return 0;
1495
1496         /* compute weighted interpolation between samples */
1497         zero_v3(ao);
1498         zero_v3(indirect);
1499         totw= 0.0f;
1500
1501         x1= samples[0]->x;
1502         y1= samples[0]->y;
1503         x2= samples[3]->x;
1504         y2= samples[3]->y;
1505
1506         tx= (float)(x2 - x)/(float)(x2 - x1);
1507         ty= (float)(y2 - y)/(float)(y2 - y1);
1508
1509         wb[3]= (1.0f-tx)*(1.0f-ty);
1510         wb[2]= (tx)*(1.0f-ty);
1511         wb[1]= (1.0f-tx)*(ty);
1512         wb[0]= tx*ty;
1513
1514         for(i=0; i<4; i++) {
1515                 VECSUB(d, samples[i]->co, co);
1516                 dist2= INPR(d, d);
1517
1518                 wz[i]= 1.0f; //(samples[i]->dist2/(1e-4f + dist2));
1519                 wn[i]= pow(INPR(samples[i]->n, n), 32.0f);
1520
1521                 w= wb[i]*wn[i]*wz[i];
1522
1523                 totw += w;
1524                 madd_v3_v3fl(ao, samples[i]->ao, w);
1525                 madd_v3_v3fl(indirect, samples[i]->indirect, w);
1526         }
1527
1528         if(totw >= 0.9f) {
1529                 totw= 1.0f/totw;
1530                 mul_v3_fl(ao, totw);
1531                 mul_v3_fl(indirect, totw);
1532                 return 1;
1533         }
1534
1535         return 0;
1536 }
1537
1538 static void sample_occ_surface(ShadeInput *shi)
1539 {
1540         StrandRen *strand= shi->strand;
1541         StrandSurface *mesh= strand->buffer->surface;
1542         int *face, *index = RE_strandren_get_face(shi->obr, strand, 0);
1543         float w[4], *co1, *co2, *co3, *co4;
1544
1545         if(mesh && mesh->face && mesh->co && mesh->ao && index) {
1546                 face= mesh->face[*index];
1547
1548                 co1= mesh->co[face[0]];
1549                 co2= mesh->co[face[1]];
1550                 co3= mesh->co[face[2]];
1551                 co4= (face[3])? mesh->co[face[3]]: NULL;
1552
1553                 interp_weights_face_v3(w, co1, co2, co3, co4, strand->vert->co);
1554
1555                 zero_v3(shi->ao);
1556                 zero_v3(shi->indirect);
1557
1558                 madd_v3_v3fl(shi->ao, mesh->ao[face[0]], w[0]);
1559                 madd_v3_v3fl(shi->indirect, mesh->indirect[face[0]], w[0]);
1560                 madd_v3_v3fl(shi->ao, mesh->ao[face[1]], w[1]);
1561                 madd_v3_v3fl(shi->indirect, mesh->indirect[face[1]], w[1]);
1562                 madd_v3_v3fl(shi->ao, mesh->ao[face[2]], w[2]);
1563                 madd_v3_v3fl(shi->indirect, mesh->indirect[face[2]], w[2]);
1564                 if(face[3]) {
1565                         madd_v3_v3fl(shi->ao, mesh->ao[face[3]], w[3]);
1566                         madd_v3_v3fl(shi->indirect, mesh->indirect[face[3]], w[3]);
1567                 }
1568         }
1569         else {
1570                 shi->ao[0]= 1.0f;
1571                 shi->ao[1]= 1.0f;
1572                 shi->ao[2]= 1.0f;
1573                 zero_v3(shi->indirect);
1574         }
1575 }
1576
1577 /* ------------------------- External Functions --------------------------- */
1578
1579 static void *exec_strandsurface_sample(void *data)
1580 {
1581         OcclusionThread *othread= (OcclusionThread*)data;
1582         Render *re= othread->re;
1583         StrandSurface *mesh= othread->mesh;
1584         float ao[3], indirect[3], co[3], n[3], *co1, *co2, *co3, *co4;
1585         int a, *face;
1586
1587         for(a=othread->begin; a<othread->end; a++) {
1588                 face= mesh->face[a];
1589                 co1= mesh->co[face[0]];
1590                 co2= mesh->co[face[1]];
1591                 co3= mesh->co[face[2]];
1592
1593                 if(face[3]) {
1594                         co4= mesh->co[face[3]];
1595
1596                         interp_v3_v3v3(co, co1, co3, 0.5f);
1597                         normal_quad_v3( n,co1, co2, co3, co4);
1598                 }
1599                 else {
1600                         cent_tri_v3(co, co1, co2, co3);
1601                         normal_tri_v3( n,co1, co2, co3);
1602                 }
1603                 negate_v3(n);
1604
1605                 sample_occ_tree(re, re->occlusiontree, NULL, co, n, othread->thread, 0, ao, indirect);
1606                 VECCOPY(othread->faceao[a], ao);
1607                 VECCOPY(othread->faceindirect[a], indirect);
1608         }
1609
1610         return 0;
1611 }
1612
1613 void make_occ_tree(Render *re)
1614 {
1615         OcclusionThread othreads[BLENDER_MAX_THREADS];
1616         StrandSurface *mesh;
1617         ListBase threads;
1618         float ao[3], indirect[3], (*faceao)[3], (*faceindirect)[3];
1619         int a, totface, totthread, *face, *count;
1620
1621         /* ugly, needed for occ_face */
1622         R= *re;
1623
1624         re->i.infostr= "Occlusion preprocessing";
1625         re->stats_draw(re->sdh, &re->i);
1626         
1627         re->occlusiontree= occ_tree_build(re);
1628         
1629         if(re->occlusiontree) {
1630                 if(re->wrld.ao_approx_passes > 0)
1631                         occ_compute_passes(re, re->occlusiontree, re->wrld.ao_approx_passes);
1632                 if(re->wrld.ao_indirect_bounces > 1)
1633                         occ_compute_bounces(re, re->occlusiontree, re->wrld.ao_indirect_bounces);
1634
1635                 for(mesh=re->strandsurface.first; mesh; mesh=mesh->next) {
1636                         if(!mesh->face || !mesh->co || !mesh->ao)
1637                                 continue;
1638
1639                         count= MEM_callocN(sizeof(int)*mesh->totvert, "OcclusionCount");
1640                         faceao= MEM_callocN(sizeof(float)*3*mesh->totface, "StrandSurfFaceAO");
1641                         faceindirect= MEM_callocN(sizeof(float)*3*mesh->totface, "StrandSurfFaceIndirect");
1642
1643                         totthread= (mesh->totface > 10000)? re->r.threads: 1;
1644                         totface= mesh->totface/totthread;
1645                         for(a=0; a<totthread; a++) {
1646                                 othreads[a].re= re;
1647                                 othreads[a].faceao= faceao;
1648                                 othreads[a].faceindirect= faceindirect;
1649                                 othreads[a].thread= a;
1650                                 othreads[a].mesh= mesh;
1651                                 othreads[a].begin= a*totface;
1652                                 othreads[a].end= (a == totthread-1)? mesh->totface: (a+1)*totface;
1653                         }
1654
1655                         if(totthread == 1) {
1656                                 exec_strandsurface_sample(&othreads[0]);
1657                         }
1658                         else {
1659                                 BLI_init_threads(&threads, exec_strandsurface_sample, totthread);
1660
1661                                 for(a=0; a<totthread; a++)
1662                                         BLI_insert_thread(&threads, &othreads[a]);
1663
1664                                 BLI_end_threads(&threads);
1665                         }
1666
1667                         for(a=0; a<mesh->totface; a++) {
1668                                 face= mesh->face[a];
1669
1670                                 VECCOPY(ao, faceao[a]);
1671                                 VECCOPY(indirect, faceindirect[a]);
1672
1673                                 VECADD(mesh->ao[face[0]], mesh->ao[face[0]], ao);
1674                                 VECADD(mesh->indirect[face[0]], mesh->indirect[face[0]], indirect);
1675                                 count[face[0]]++;
1676                                 VECADD(mesh->ao[face[1]], mesh->ao[face[1]], ao);
1677                                 VECADD(mesh->indirect[face[1]], mesh->indirect[face[1]], indirect);
1678                                 count[face[1]]++;
1679                                 VECADD(mesh->ao[face[2]], mesh->ao[face[2]], ao);
1680                                 VECADD(mesh->indirect[face[2]], mesh->indirect[face[2]], indirect);
1681                                 count[face[2]]++;
1682
1683                                 if(face[3]) {
1684                                         VECADD(mesh->ao[face[3]], mesh->ao[face[3]], ao);
1685                                         VECADD(mesh->indirect[face[3]], mesh->indirect[face[3]], indirect);
1686                                         count[face[3]]++;
1687                                 }
1688                         }
1689
1690                         for(a=0; a<mesh->totvert; a++) {
1691                                 if(count[a]) {
1692                                         mul_v3_fl(mesh->ao[a], 1.0f/count[a]);
1693                                         mul_v3_fl(mesh->indirect[a], 1.0f/count[a]);
1694                                 }
1695                         }
1696
1697                         MEM_freeN(count);
1698                         MEM_freeN(faceao);
1699                         MEM_freeN(faceindirect);
1700                 }
1701         }
1702 }
1703
1704 void free_occ(Render *re)
1705 {
1706         if(re->occlusiontree) {
1707                 occ_free_tree(re->occlusiontree);
1708                 re->occlusiontree = NULL;
1709         }
1710 }
1711
1712 void sample_occ(Render *re, ShadeInput *shi)
1713 {
1714         OcclusionTree *tree= re->occlusiontree;
1715         OcclusionCache *cache;
1716         OcclusionCacheSample *sample;
1717         OccFace exclude;
1718         int onlyshadow;
1719
1720         if(tree) {
1721                 if(shi->strand) {
1722                         sample_occ_surface(shi);
1723                 }
1724                 /* try to get result from the cache if possible */
1725                 else if(shi->depth!=0 || !sample_occ_cache(tree, shi->co, shi->vno, shi->xs, shi->ys, shi->thread, shi->ao, shi->indirect)) {
1726                         /* no luck, let's sample the occlusion */
1727                         exclude.obi= shi->obi - re->objectinstance;
1728                         exclude.facenr= shi->vlr->index;
1729                         onlyshadow= (shi->mat->mode & MA_ONLYSHADOW);
1730                         sample_occ_tree(re, tree, &exclude, shi->co, shi->vno, shi->thread, onlyshadow, shi->ao, shi->indirect);
1731
1732                         /* fill result into sample, each time */
1733                         if(tree->cache) {
1734                                 cache= &tree->cache[shi->thread];
1735
1736                                 if(cache->sample && cache->step) {
1737                                         sample= &cache->sample[(shi->ys-cache->y)*cache->w + (shi->xs-cache->x)];
1738                                         VECCOPY(sample->co, shi->co);
1739                                         VECCOPY(sample->n, shi->vno);
1740                                         VECCOPY(sample->ao, shi->ao);
1741                                         VECCOPY(sample->indirect, shi->indirect);
1742                                         sample->intensity= MAX3(sample->ao[0], sample->ao[1], sample->ao[2]);
1743                                         sample->intensity= MAX2(sample->intensity, MAX3(sample->indirect[0], sample->indirect[1], sample->indirect[2]));
1744                                         sample->dist2= INPR(shi->dxco, shi->dxco) + INPR(shi->dyco, shi->dyco);
1745                                         sample->filled= 1;
1746                                 }
1747                         }
1748                 }
1749         }
1750         else {
1751                 shi->ao[0]= 1.0f;
1752                 shi->ao[1]= 1.0f;
1753                 shi->ao[2]= 1.0f;
1754
1755                 shi->indirect[0]= 0.0f;
1756                 shi->indirect[1]= 0.0f;
1757                 shi->indirect[2]= 0.0f;
1758         }
1759 }
1760
1761 void cache_occ_samples(Render *re, RenderPart *pa, ShadeSample *ssamp)
1762 {
1763         OcclusionTree *tree= re->occlusiontree;
1764         PixStr ps;
1765         OcclusionCache *cache;
1766         OcclusionCacheSample *sample;
1767         OccFace exclude;
1768         ShadeInput *shi;
1769         intptr_t *rd=NULL;
1770         int *ro=NULL, *rp=NULL, *rz=NULL, onlyshadow;
1771         int x, y, step = CACHE_STEP;
1772
1773         if(!tree->cache)
1774                 return;
1775
1776         cache= &tree->cache[pa->thread];
1777         cache->w= pa->rectx;
1778         cache->h= pa->recty;
1779         cache->x= pa->disprect.xmin;
1780         cache->y= pa->disprect.ymin;
1781         cache->step= step;
1782         cache->sample= MEM_callocN(sizeof(OcclusionCacheSample)*cache->w*cache->h, "OcclusionCacheSample");
1783         sample= cache->sample;
1784
1785         if(re->osa) {
1786                 rd= pa->rectdaps;
1787         }
1788         else {
1789                 /* fake pixel struct for non-osa */
1790                 ps.next= NULL;
1791                 ps.mask= 0xFFFF;
1792
1793                 ro= pa->recto;
1794                 rp= pa->rectp;
1795                 rz= pa->rectz;
1796         }
1797
1798         /* compute a sample at every step pixels */
1799         for(y=pa->disprect.ymin; y<pa->disprect.ymax; y++) {
1800                 for(x=pa->disprect.xmin; x<pa->disprect.xmax; x++, sample++, rd++, ro++, rp++, rz++) {
1801                         if(!(((x - pa->disprect.xmin + step) % step) == 0 || x == pa->disprect.xmax-1))
1802                                 continue;
1803                         if(!(((y - pa->disprect.ymin + step) % step) == 0 || y == pa->disprect.ymax-1))
1804                                 continue;
1805
1806                         if(re->osa) {
1807                                 if(!*rd) continue;
1808
1809                                 shade_samples_fill_with_ps(ssamp, (PixStr *)(*rd), x, y);
1810                         }
1811                         else {
1812                                 if(!*rp) continue;
1813
1814                                 ps.obi= *ro;
1815                                 ps.facenr= *rp;
1816                                 ps.z= *rz;
1817                                 shade_samples_fill_with_ps(ssamp, &ps, x, y);
1818                         }
1819
1820                         shi= ssamp->shi;
1821                         if(shi->vlr) {
1822                                 onlyshadow= (shi->mat->mode & MA_ONLYSHADOW);
1823                                 exclude.obi= shi->obi - re->objectinstance;
1824                                 exclude.facenr= shi->vlr->index;
1825                                 sample_occ_tree(re, tree, &exclude, shi->co, shi->vno, shi->thread, onlyshadow, shi->ao, shi->indirect);
1826
1827                                 VECCOPY(sample->co, shi->co);
1828                                 VECCOPY(sample->n, shi->vno);
1829                                 VECCOPY(sample->ao, shi->ao);
1830                                 VECCOPY(sample->indirect, shi->indirect);
1831                                 sample->intensity= MAX3(sample->ao[0], sample->ao[1], sample->ao[2]);
1832                                 sample->intensity= MAX2(sample->intensity, MAX3(sample->indirect[0], sample->indirect[1], sample->indirect[2]));
1833                                 sample->dist2= INPR(shi->dxco, shi->dxco) + INPR(shi->dyco, shi->dyco);
1834                                 sample->x= shi->xs;
1835                                 sample->y= shi->ys;
1836                                 sample->filled= 1;
1837                         }
1838
1839                         if(re->test_break(re->tbh))
1840                                 break;
1841                 }
1842         }
1843 }
1844
1845 void free_occ_samples(Render *re, RenderPart *pa)
1846 {
1847         OcclusionTree *tree= re->occlusiontree;
1848         OcclusionCache *cache;
1849
1850         if(tree->cache) {
1851                 cache= &tree->cache[pa->thread];
1852
1853                 if(cache->sample)
1854                         MEM_freeN(cache->sample);
1855
1856                 cache->w= 0;
1857                 cache->h= 0;
1858                 cache->step= 0;
1859         }
1860 }
1861