Bugfix for unnecessary missing object warnings for speed vectors,
[blender.git] / source / blender / render / intern / source / occlusion.c
1 /* 
2  * $Id$
3  *
4  * ***** BEGIN GPL LICENSE BLOCK *****
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software Foundation,
18  * Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
19  *
20  * The Original Code is Copyright (C) 2008 Blender Foundation.
21  * All rights reserved.
22  *
23  * The Original Code is: all of this file.
24  *
25  * Contributor(s): Brecht Van Lommel.
26  *
27  * ***** END GPL LICENSE BLOCK *****
28  */
29
30 #include <math.h>
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34
35 #include "MEM_guardedalloc.h"
36
37 #include "DNA_material_types.h"
38
39 #include "BLI_arithb.h"
40 #include "BLI_blenlib.h"
41 #include "BLI_memarena.h"
42 #include "BLI_threads.h"
43
44 #include "BKE_global.h"
45 #include "BKE_scene.h"
46 #include "BKE_utildefines.h"
47
48 #include "RE_shader_ext.h"
49
50 /* local includes */
51 #include "occlusion.h"
52 #include "render_types.h"
53 #include "rendercore.h"
54 #include "renderdatabase.h"
55 #include "pixelshading.h"
56 #include "shading.h"
57 #include "zbuf.h"
58
59 /* ------------------------- Declarations --------------------------- */
60
61 #define INVALID_INDEX ((int)(~0))
62 #define INVPI 0.31830988618379069f
63 #define TOTCHILD 8
64 #define CACHE_STEP 3
65
66 typedef struct OcclusionCacheSample {
67         float co[3], n[3], col[3], intensity, dist2;
68         int x, y, filled;
69 } OcclusionCacheSample;
70
71 typedef struct OcclusionCache {
72         OcclusionCacheSample *sample;
73         int x, y, w, h, step;
74 } OcclusionCache;
75
76 typedef struct OccFace {
77         int obi;
78         int facenr;
79 } OccFace;
80
81 typedef struct OccNode {
82         float co[3], area;
83         float sh[9], dco;
84         float occlusion;
85         int childflag;
86         union {
87                 //OccFace face;
88                 int face;
89                 struct OccNode *node;
90         } child[TOTCHILD];
91 } OccNode;
92
93 typedef struct OcclusionTree {
94         MemArena *arena;
95
96         float (*co)[3];         /* temporary during build */
97
98         OccFace *face;          /* instance and face indices */
99         float *occlusion;       /* occlusion for faces */
100         
101         OccNode *root;
102
103         OccNode **stack[BLENDER_MAX_THREADS];
104         int maxdepth;
105
106         int totface;
107
108         float error;
109         float distfac;
110
111         int dothreadedbuild;
112         int totbuildthread;
113
114         OcclusionCache *cache;
115 } OcclusionTree;
116
117 typedef struct OcclusionThread {
118         Render *re;
119         StrandSurface *mesh;
120         float (*facecol)[3];
121         int begin, end;
122         int thread;
123 } OcclusionThread;
124
125 typedef struct OcclusionBuildThread {
126         OcclusionTree *tree;
127         int begin, end, depth;
128         OccNode *node;
129 } OcclusionBuildThread;
130
131 /* ------------------------- Shading --------------------------- */
132
133 extern Render R; // meh
134
135 #if 0
136 static void occ_shade(ShadeSample *ssamp, ObjectInstanceRen *obi, VlakRen *vlr, float *rad)
137 {
138         ShadeInput *shi= ssamp->shi;
139         ShadeResult *shr= ssamp->shr;
140         float l, u, v, *v1, *v2, *v3;
141         
142         /* init */
143         if(vlr->v4) {
144                 shi->u= u= 0.5f;
145                 shi->v= v= 0.5f;
146         }
147         else {
148                 shi->u= u= 1.0f/3.0f;
149                 shi->v= v= 1.0f/3.0f;
150         }
151
152         /* setup render coordinates */
153         v1= vlr->v1->co;
154         v2= vlr->v2->co;
155         v3= vlr->v3->co;
156         
157         /* renderco */
158         l= 1.0f-u-v;
159         
160         shi->co[0]= l*v3[0]+u*v1[0]+v*v2[0];
161         shi->co[1]= l*v3[1]+u*v1[1]+v*v2[1];
162         shi->co[2]= l*v3[2]+u*v1[2]+v*v2[2];
163         
164         shade_input_set_triangle_i(shi, obi, vlr, 0, 1, 2);
165
166         /* set up view vector */
167         VECCOPY(shi->view, shi->co);
168         Normalize(shi->view);
169         
170         /* cache for shadow */
171         shi->samplenr++;
172         
173         shi->xs= 0; // TODO
174         shi->ys= 0;
175         
176         shade_input_set_normals(shi);
177
178         /* no normal flip */
179         if(shi->flippednor)
180                 shade_input_flip_normals(shi);
181
182         /* not a pretty solution, but fixes common cases */
183         if(shi->obr->ob && shi->obr->ob->transflag & OB_NEG_SCALE) {
184                 VecMulf(shi->vn, -1.0f);
185                 VecMulf(shi->vno, -1.0f);
186         }
187
188         /* init material vars */
189         // note, keep this synced with render_types.h
190         memcpy(&shi->r, &shi->mat->r, 23*sizeof(float));
191         shi->har= shi->mat->har;
192         
193         /* render */
194         shade_input_set_shade_texco(shi);
195         shade_material_loop(shi, shr); /* todo: nodes */
196         
197         VECCOPY(rad, shr->combined);
198 }
199
200 static void occ_build_shade(Render *re, OcclusionTree *tree)
201 {
202         ShadeSample ssamp;
203         ObjectInstanceRen *obi;
204         VlakRen *vlr;
205         int a;
206
207         R= *re;
208
209         /* setup shade sample with correct passes */
210         memset(&ssamp, 0, sizeof(ShadeSample));
211         ssamp.shi[0].lay= re->scene->lay;
212         ssamp.shi[0].passflag= SCE_PASS_DIFFUSE|SCE_PASS_RGBA;
213         ssamp.shi[0].combinedflag= ~(SCE_PASS_SPEC);
214         ssamp.tot= 1;
215
216         for(a=0; a<tree->totface; a++) {
217                 obi= &R.objectinstance[tree->face[a].obi];
218                 vlr= RE_findOrAddVlak(obi->obr, tree->face[a].vlr);
219
220                 occ_shade(&ssamp, obi, vlr, tree->rad[a]);
221         }
222 }
223 #endif
224
225 /* ------------------------- Spherical Harmonics --------------------------- */
226
227 /* Use 2nd order SH => 9 coefficients, stored in this order:
228    0 = (0,0),
229    1 = (1,-1), 2 = (1,0), 3 = (1,1),
230    4 = (2,-2), 5 = (2,-1), 6 = (2,0), 7 = (2,1), 8 = (2,2) */
231
232 static void sh_copy(float *shresult, float *sh)
233 {
234         memcpy(shresult, sh, sizeof(float)*9);
235 }
236
237 static void sh_mul(float *sh, float f)
238 {
239         int i;
240
241         for(i=0; i<9; i++)
242                 sh[i] *= f;
243 }
244
245 static void sh_add(float *shresult, float *sh1, float *sh2)
246 {
247         int i;
248
249         for(i=0; i<9; i++)
250                 shresult[i]= sh1[i] + sh2[i];
251 }
252
253 static void sh_from_disc(float *n, float area, float *shresult)
254 {
255         /* See formula (3) in:
256            "An Efficient Representation for Irradiance Environment Maps" */
257         float sh[9], x, y, z;
258
259         x= n[0];
260         y= n[1];
261         z= n[2];
262
263         sh[0]= 0.282095f;
264
265         sh[1]= 0.488603f*y;
266         sh[2]= 0.488603f*z;
267         sh[3]= 0.488603f*x;
268         
269         sh[4]= 1.092548f*x*y;
270         sh[5]= 1.092548f*y*z;
271         sh[6]= 0.315392f*(3.0f*z*z - 1.0f);
272         sh[7]= 1.092548f*x*z;
273         sh[8]= 0.546274f*(x*x - y*y);
274
275         sh_mul(sh, area);
276         sh_copy(shresult, sh);
277 }
278
279 static float sh_eval(float *sh, float *v)
280 {
281         /* See formula (13) in:
282            "An Efficient Representation for Irradiance Environment Maps" */
283         static const float c1 = 0.429043f, c2 = 0.511664f, c3 = 0.743125f;
284         static const float c4 = 0.886227f, c5 = 0.247708f;
285         float x, y, z, sum;
286
287         x= v[0];
288         y= v[1];
289         z= v[2];
290
291         sum= c1*sh[8]*(x*x - y*y);
292         sum += c3*sh[6]*z*z;
293         sum += c4*sh[0];
294         sum += -c5*sh[6];
295         sum += 2.0f*c1*(sh[4]*x*y + sh[7]*x*z + sh[5]*y*z);
296         sum += 2.0f*c2*(sh[3]*x + sh[1]*y + sh[2]*z);
297
298         return sum;
299 }
300
301 /* ------------------------------ Building --------------------------------- */
302
303 static void occ_face(const OccFace *face, float *co, float *normal, float *area)
304 {
305         ObjectInstanceRen *obi;
306         VlakRen *vlr;
307         float v1[3], v2[3], v3[3], v4[3];
308
309         obi= &R.objectinstance[face->obi];
310         vlr= RE_findOrAddVlak(obi->obr, face->facenr);
311         
312         if(co) {
313                 if(vlr->v4)
314                         VecLerpf(co, vlr->v1->co, vlr->v3->co, 0.5f);
315                 else
316                         CalcCent3f(co, vlr->v1->co, vlr->v2->co, vlr->v3->co);
317
318                 if(obi->flag & R_TRANSFORMED)
319                         Mat4MulVecfl(obi->mat, co);
320         }
321         
322         if(normal) {
323                 normal[0]= -vlr->n[0];
324                 normal[1]= -vlr->n[1];
325                 normal[2]= -vlr->n[2];
326
327                 if(obi->flag & R_TRANSFORMED) {
328                         float xn, yn, zn, (*imat)[3]= obi->imat;
329
330                         xn= normal[0];
331                         yn= normal[1];
332                         zn= normal[2];
333
334                         normal[0]= imat[0][0]*xn + imat[0][1]*yn + imat[0][2]*zn;
335                         normal[1]= imat[1][0]*xn + imat[1][1]*yn + imat[1][2]*zn;
336                         normal[2]= imat[2][0]*xn + imat[2][1]*yn + imat[2][2]*zn;
337                 }
338         }
339
340         if(area) {
341                 VECCOPY(v1, vlr->v1->co);
342                 VECCOPY(v2, vlr->v2->co);
343                 VECCOPY(v3, vlr->v3->co);
344                 if(vlr->v4) VECCOPY(v4, vlr->v4->co);
345
346                 if(obi->flag & R_TRANSFORMED) {
347                         Mat4MulVecfl(obi->mat, v1);
348                         Mat4MulVecfl(obi->mat, v2);
349                         Mat4MulVecfl(obi->mat, v3);
350                         if(vlr->v4) Mat4MulVecfl(obi->mat, v4);
351                 }
352
353                 /* todo: correct area for instances */
354                 if(vlr->v4)
355                         *area= AreaQ3Dfl(v1, v2, v3, v4);
356                 else
357                         *area= AreaT3Dfl(v1, v2, v3);
358         }
359 }
360
361 static void occ_sum_occlusion(OcclusionTree *tree, OccNode *node)
362 {
363         OccNode *child;
364         float occ, area, totarea;
365         int a, b;
366
367         occ= 0.0f;
368         totarea= 0.0f;
369
370         for(b=0; b<TOTCHILD; b++) {
371                 if(node->childflag & (1<<b)) {
372                         a= node->child[b].face;
373                         occ_face(&tree->face[a], 0, 0, &area);
374                         occ += area*tree->occlusion[a];
375                         totarea += area;
376                 }
377                 else if(node->child[b].node) {
378                         child= node->child[b].node;
379                         occ_sum_occlusion(tree, child);
380
381                         occ += child->area*child->occlusion;
382                         totarea += child->area;
383                 }
384         }
385
386         if(totarea != 0.0f)
387                 occ /= totarea;
388         
389         node->occlusion= occ;
390 }
391
392 static int occ_find_bbox_axis(OcclusionTree *tree, int begin, int end, float *min, float *max)
393 {
394         float len, maxlen= -1.0f;
395         int a, axis = 0;
396
397         INIT_MINMAX(min, max);
398
399         for(a=begin; a<end; a++)
400                 DO_MINMAX(tree->co[a], min, max)
401
402         for(a=0; a<3; a++) {
403                 len= max[a] - min[a];
404
405                 if(len > maxlen) {
406                         maxlen= len;
407                         axis= a;
408                 }
409         }
410
411         return axis;
412 }
413
414 void occ_node_from_face(OccFace *face, OccNode *node)
415 {
416         float n[3];
417
418         occ_face(face, node->co, n, &node->area);
419         node->dco= 0.0f;
420         sh_from_disc(n, node->area, node->sh);
421 }
422
423 static void occ_build_dco(OcclusionTree *tree, OccNode *node, float *co, float *dco)
424 {
425         OccNode *child;
426         float dist, d[3], nco[3];
427         int b;
428
429         for(b=0; b<TOTCHILD; b++) {
430                 if(node->childflag & (1<<b)) {
431                         occ_face(tree->face+node->child[b].face, nco, 0, 0);
432                 }
433                 else if(node->child[b].node) {
434                         child= node->child[b].node;
435                         occ_build_dco(tree, child, co, dco);
436                         VECCOPY(nco, child->co);
437                 }
438
439                 VECSUB(d, nco, co);
440                 dist= INPR(d, d);
441                 if(dist > *dco)
442                         *dco= dist;
443         }
444 }
445
446 static void occ_build_split(OcclusionTree *tree, int begin, int end, int *split)
447 {
448         float min[3], max[3], mid;
449         int axis, a, enda;
450
451         /* split in middle of boundbox. this seems faster than median split
452          * on complex scenes, possibly since it avoids two distant faces to
453          * be in the same node better? */
454         axis= occ_find_bbox_axis(tree, begin, end, min, max);
455         mid= 0.5f*(min[axis]+max[axis]);
456
457         a= begin;
458         enda= end;
459         while(a<enda) {
460                 if(tree->co[a][axis] > mid) {
461                         enda--;
462                         SWAP(OccFace, tree->face[a], tree->face[enda]);
463                         SWAP(float, tree->co[a][0], tree->co[enda][0]);
464                         SWAP(float, tree->co[a][1], tree->co[enda][1]);
465                         SWAP(float, tree->co[a][2], tree->co[enda][2]);
466                 }
467                 else
468                         a++;
469         }
470
471         *split= enda;
472 }
473
474 static void occ_build_8_split(OcclusionTree *tree, int begin, int end, int *offset, int *count)
475 {
476         /* split faces into eight groups */
477         int b, splitx, splity[2], splitz[4];
478
479         occ_build_split(tree, begin, end, &splitx);
480
481         /* force split if none found, to deal with degenerate geometry */
482         if(splitx == begin || splitx == end)
483                 splitx= (begin+end)/2;
484
485         occ_build_split(tree, begin, splitx, &splity[0]);
486         occ_build_split(tree, splitx, end, &splity[1]);
487
488         occ_build_split(tree, begin, splity[0], &splitz[0]);
489         occ_build_split(tree, splity[0], splitx, &splitz[1]);
490         occ_build_split(tree, splitx, splity[1], &splitz[2]);
491         occ_build_split(tree, splity[1], end, &splitz[3]);
492
493         offset[0]= begin;
494         offset[1]= splitz[0];
495         offset[2]= splity[0];
496         offset[3]= splitz[1];
497         offset[4]= splitx;
498         offset[5]= splitz[2];
499         offset[6]= splity[1];
500         offset[7]= splitz[3];
501
502         for(b=0; b<7; b++)
503                 count[b]= offset[b+1] - offset[b];
504         count[7]= end - offset[7];
505 }
506
507 static void occ_build_recursive(OcclusionTree *tree, OccNode *node, int begin, int end, int depth);
508
509 static void *exec_occ_build(void *data)
510 {
511         OcclusionBuildThread *othread= (OcclusionBuildThread*)data;
512
513         occ_build_recursive(othread->tree, othread->node, othread->begin, othread->end, othread->depth);
514
515         return 0;
516 }
517
518 static void occ_build_recursive(OcclusionTree *tree, OccNode *node, int begin, int end, int depth)
519 {
520         ListBase threads;
521         OcclusionBuildThread othreads[BLENDER_MAX_THREADS];
522         OccNode *child, tmpnode;
523         OccFace *face;
524         int a, b, totthread=0, offset[TOTCHILD], count[TOTCHILD];
525
526         /* keep track of maximum depth for stack */
527         if(depth > tree->maxdepth)
528                 tree->maxdepth= depth;
529
530         /* add a new node */
531         node->occlusion= 1.0f;
532
533         /* leaf node with only children */
534         if(end - begin <= TOTCHILD) {
535                 for(a=begin, b=0; a<end; a++, b++) {
536                         face= &tree->face[a];
537                         node->child[b].face= a;
538                         node->childflag |= (1<<b);
539                 }
540         }
541         else {
542                 /* order faces */
543                 occ_build_8_split(tree, begin, end, offset, count);
544
545                 if(depth == 1 && tree->dothreadedbuild)
546                         BLI_init_threads(&threads, exec_occ_build, tree->totbuildthread);
547
548                 for(b=0; b<TOTCHILD; b++) {
549                         if(count[b] == 0) {
550                                 node->child[b].node= NULL;
551                         }
552                         else if(count[b] == 1) {
553                                 face= &tree->face[offset[b]];
554                                 node->child[b].face= offset[b];
555                                 node->childflag |= (1<<b);
556                         }
557                         else {
558                                 if(tree->dothreadedbuild)
559                                         BLI_lock_thread(LOCK_CUSTOM1);
560
561                                 child= BLI_memarena_alloc(tree->arena, sizeof(OccNode));
562                                 node->child[b].node= child;
563
564                                 if(tree->dothreadedbuild)
565                                         BLI_unlock_thread(LOCK_CUSTOM1);
566
567                                 if(depth == 1 && tree->dothreadedbuild) {
568                                         othreads[totthread].tree= tree;
569                                         othreads[totthread].node= child;
570                                         othreads[totthread].begin= offset[b];
571                                         othreads[totthread].end= offset[b]+count[b];
572                                         othreads[totthread].depth= depth+1;
573                                         BLI_insert_thread(&threads, &othreads[totthread]);
574                                         totthread++;
575                                 }
576                                 else
577                                         occ_build_recursive(tree, child, offset[b], offset[b]+count[b], depth+1);
578                         }
579                 }
580
581                 if(depth == 1 && tree->dothreadedbuild)
582                         BLI_end_threads(&threads);
583         }
584
585         /* combine area, position and sh */
586         for(b=0; b<TOTCHILD; b++) {
587                 if(node->childflag & (1<<b)) {
588                         child= &tmpnode;
589                         occ_node_from_face(tree->face+node->child[b].face, &tmpnode);
590                 }
591                 else {
592                         child= node->child[b].node;
593                 }
594
595                 if(child) {
596                         node->area += child->area;
597                         sh_add(node->sh, node->sh, child->sh);
598                         VECADDFAC(node->co, node->co, child->co, child->area);
599                 }
600         }
601
602         if(node->area != 0.0f)
603                 VecMulf(node->co, 1.0f/node->area);
604
605         /* compute maximum distance from center */
606         node->dco= 0.0f;
607         occ_build_dco(tree, node, node->co, &node->dco);
608 }
609
610 static void occ_build_sh_normalize(OccNode *node)
611 {
612         /* normalize spherical harmonics to not include area, so
613          * we can clamp the dot product and then mutliply by area */
614         int b;
615
616         if(node->area != 0.0f)
617                 sh_mul(node->sh, 1.0f/node->area);
618
619         for(b=0; b<TOTCHILD; b++) {
620                 if(node->childflag & (1<<b));
621                 else if(node->child[b].node)
622                         occ_build_sh_normalize(node->child[b].node);
623         }
624 }
625
626 static OcclusionTree *occ_tree_build(Render *re)
627 {
628         OcclusionTree *tree;
629         ObjectInstanceRen *obi;
630         ObjectRen *obr;
631         VlakRen *vlr= NULL;
632         int a, b, c, totface;
633
634         /* count */
635         totface= 0;
636         for(obi=re->instancetable.first; obi; obi=obi->next) {
637                 obr= obi->obr;
638                 for(a=0; a<obr->totvlak; a++) {
639                         if((a & 255)==0) vlr= obr->vlaknodes[a>>8].vlak;
640                         else vlr++;
641
642                         if(vlr->mat->mode & MA_TRACEBLE)
643                                 totface++;
644                 }
645         }
646
647         if(totface == 0)
648                 return NULL;
649         
650         tree= MEM_callocN(sizeof(OcclusionTree), "OcclusionTree");
651         tree->totface= totface;
652
653         /* parameters */
654         tree->error= get_render_aosss_error(&re->r, re->wrld.ao_approx_error);
655         tree->distfac= (re->wrld.aomode & WO_AODIST)? re->wrld.aodistfac: 0.0f;
656
657         /* allocation */
658         tree->arena= BLI_memarena_new(0x8000 * sizeof(OccNode));
659         BLI_memarena_use_calloc(tree->arena);
660
661         if(re->wrld.aomode & WO_AOCACHE)
662                 tree->cache= MEM_callocN(sizeof(OcclusionCache)*BLENDER_MAX_THREADS, "OcclusionCache");
663
664         tree->face= MEM_callocN(sizeof(OccFace)*totface, "OcclusionFace");
665         tree->co= MEM_callocN(sizeof(float)*3*totface, "OcclusionCo");
666         tree->occlusion= MEM_callocN(sizeof(float)*totface, "OcclusionOcclusion");
667
668         /* make array of face pointers */
669         for(b=0, c=0, obi=re->instancetable.first; obi; obi=obi->next, c++) {
670                 obr= obi->obr;
671                 for(a=0; a<obr->totvlak; a++) {
672                         if((a & 255)==0) vlr= obr->vlaknodes[a>>8].vlak;
673                         else vlr++;
674
675                         if(vlr->mat->mode & MA_TRACEBLE) {
676                                 tree->face[b].obi= c;
677                                 tree->face[b].facenr= a;
678                                 tree->occlusion[b]= 1.0f;
679                                 occ_face(&tree->face[b], tree->co[b], NULL, NULL); 
680                                 b++;
681                         }
682                 }
683         }
684
685         /* threads */
686         tree->totbuildthread= (re->r.threads > 1 && totface > 10000)? 8: 1;
687         tree->dothreadedbuild= (tree->totbuildthread > 1);
688
689         /* recurse */
690         tree->root= BLI_memarena_alloc(tree->arena, sizeof(OccNode));
691         occ_build_recursive(tree, tree->root, 0, totface, 1);
692
693 #if 0
694         if(tree->doindirect) {
695                 occ_build_shade(re, tree);
696                 occ_sum_occlusion(tree, tree->root);
697         }
698 #endif
699         
700         MEM_freeN(tree->co);
701         tree->co= NULL;
702
703         occ_build_sh_normalize(tree->root);
704
705         for(a=0; a<BLENDER_MAX_THREADS; a++)
706                 tree->stack[a]= MEM_callocN(sizeof(OccNode)*TOTCHILD*(tree->maxdepth+1), "OccStack");
707
708         return tree;
709 }
710
711 static void occ_free_tree(OcclusionTree *tree)
712 {
713         int a;
714
715         if(tree) {
716                 if(tree->arena) BLI_memarena_free(tree->arena);
717                 for(a=0; a<BLENDER_MAX_THREADS; a++)
718                         if(tree->stack[a])
719                                 MEM_freeN(tree->stack[a]);
720                 if(tree->occlusion) MEM_freeN(tree->occlusion);
721                 if(tree->face) MEM_freeN(tree->face);
722                 if(tree->cache) MEM_freeN(tree->cache);
723                 MEM_freeN(tree);
724         }
725 }
726
727 /* ------------------------- Traversal --------------------------- */
728
729 static float occ_solid_angle(OccNode *node, float *v, float d2, float invd2, float *receivenormal)
730 {
731         float dotreceive, dotemit;
732         float ev[3];
733
734         ev[0]= -v[0]*invd2;
735         ev[1]= -v[1]*invd2;
736         ev[2]= -v[2]*invd2;
737         dotemit= sh_eval(node->sh, ev);
738         dotreceive= INPR(receivenormal, v)*invd2;
739
740         CLAMP(dotemit, 0.0f, 1.0f);
741         CLAMP(dotreceive, 0.0f, 1.0f);
742         
743         return ((node->area*dotemit*dotreceive)/(d2 + node->area*INVPI))*INVPI;
744 }
745
746 static void VecAddDir(float *result, float *v1, float *v2, float fac)
747 {
748         result[0]= v1[0] + fac*(v2[0] - v1[0]);
749         result[1]= v1[1] + fac*(v2[1] - v1[1]);
750         result[2]= v1[2] + fac*(v2[2] - v1[2]);
751 }
752
753 static int occ_visible_quad(float *p, float *n, float *v0, float *v1, float *v2, float *q0, float *q1, float *q2, float *q3)
754 {
755         static const float epsilon = 1e-6f;
756         float c, sd[3];
757         
758         c= INPR(n, p);
759
760         /* signed distances from the vertices to the plane. */
761         sd[0]= INPR(n, v0) - c;
762         sd[1]= INPR(n, v1) - c;
763         sd[2]= INPR(n, v2) - c;
764
765         if(fabs(sd[0]) < epsilon) sd[0] = 0.0f;
766         if(fabs(sd[1]) < epsilon) sd[1] = 0.0f;
767         if(fabs(sd[2]) < epsilon) sd[2] = 0.0f;
768
769         if(sd[0] > 0) {
770                 if(sd[1] > 0) {
771                         if(sd[2] > 0) {
772                                 // +++
773                                 VECCOPY(q0, v0);
774                                 VECCOPY(q1, v1);
775                                 VECCOPY(q2, v2);
776                                 VECCOPY(q3, q2);
777                         }
778                         else if(sd[2] < 0) {
779                                 // ++-
780                                 VECCOPY(q0, v0);
781                                 VECCOPY(q1, v1);
782                                 VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
783                                 VecAddDir(q3, v0, v2, (sd[0]/(sd[0]-sd[2])));
784                         }
785                         else {
786                                 // ++0
787                                 VECCOPY(q0, v0);
788                                 VECCOPY(q1, v1);
789                                 VECCOPY(q2, v2);
790                                 VECCOPY(q3, q2);
791                         }
792                 }
793                 else if(sd[1] < 0) {
794                         if(sd[2] > 0) {
795                                 // +-+
796                                 VECCOPY(q0, v0);
797                                 VecAddDir(q1, v0, v1, (sd[0]/(sd[0]-sd[1])));
798                                 VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
799                                 VECCOPY(q3, v2);
800                         }
801                         else if(sd[2] < 0) {
802                                 // +--
803                                 VECCOPY(q0, v0);
804                                 VecAddDir(q1, v0, v1, (sd[0]/(sd[0]-sd[1])));
805                                 VecAddDir(q2, v0, v2, (sd[0]/(sd[0]-sd[2])));
806                                 VECCOPY(q3, q2);
807                         }
808                         else {
809                                 // +-0
810                                 VECCOPY(q0, v0);
811                                 VecAddDir(q1, v0, v1, (sd[0]/(sd[0]-sd[1])));
812                                 VECCOPY(q2, v2);
813                                 VECCOPY(q3, q2);
814                         }
815                 }
816                 else {
817                         if(sd[2] > 0) {
818                                 // +0+
819                                 VECCOPY(q0, v0);
820                                 VECCOPY(q1, v1);
821                                 VECCOPY(q2, v2);
822                                 VECCOPY(q3, q2);
823                         }
824                         else if(sd[2] < 0) {
825                                 // +0-
826                                 VECCOPY(q0, v0);
827                                 VECCOPY(q1, v1);
828                                 VecAddDir(q2, v0, v2, (sd[0]/(sd[0]-sd[2])));
829                                 VECCOPY(q3, q2);
830                         }
831                         else {
832                                 // +00
833                                 VECCOPY(q0, v0);
834                                 VECCOPY(q1, v1);
835                                 VECCOPY(q2, v2);
836                                 VECCOPY(q3, q2);
837                         }
838                 }
839         }
840         else if(sd[0] < 0) {
841                 if(sd[1] > 0) {
842                         if(sd[2] > 0) {
843                                 // -++
844                                 VecAddDir(q0, v0, v1, (sd[0]/(sd[0]-sd[1])));
845                                 VECCOPY(q1, v1);
846                                 VECCOPY(q2, v2);
847                                 VecAddDir(q3, v0, v2, (sd[0]/(sd[0]-sd[2])));
848                         }
849                         else if(sd[2] < 0) {
850                                 // -+-
851                                 VecAddDir(q0, v0, v1, (sd[0]/(sd[0]-sd[1])));
852                                 VECCOPY(q1, v1);
853                                 VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
854                                 VECCOPY(q3, q2);
855                         }
856                         else {
857                                 // -+0
858                                 VecAddDir(q0, v0, v1, (sd[0]/(sd[0]-sd[1])));
859                                 VECCOPY(q1, v1);
860                                 VECCOPY(q2, v2);
861                                 VECCOPY(q3, q2);
862                         }
863                 }
864                 else if(sd[1] < 0) {
865                         if(sd[2] > 0) {
866                                 // --+
867                                 VecAddDir(q0, v0, v2, (sd[0]/(sd[0]-sd[2])));
868                                 VecAddDir(q1, v1, v2, (sd[1]/(sd[1]-sd[2])));
869                                 VECCOPY(q2, v2);
870                                 VECCOPY(q3, q2);
871                         }
872                         else if(sd[2] < 0) {
873                                 // ---
874                                 return 0;
875                         }
876                         else {
877                                 // --0
878                                 return 0;
879                         }
880                 }
881                 else {
882                         if(sd[2] > 0) {
883                                 // -0+
884                                 VecAddDir(q0, v0, v2, (sd[0]/(sd[0]-sd[2])));
885                                 VECCOPY(q1, v1);
886                                 VECCOPY(q2, v2);
887                                 VECCOPY(q3, q2);
888                         }
889                         else if(sd[2] < 0) {
890                                 // -0-
891                                 return 0;
892                         }
893                         else {
894                                 // -00
895                                 return 0;
896                         }
897                 }
898         }
899         else {
900                 if(sd[1] > 0) {
901                         if(sd[2] > 0) {
902                                 // 0++
903                                 VECCOPY(q0, v0);
904                                 VECCOPY(q1, v1);
905                                 VECCOPY(q2, v2);
906                                 VECCOPY(q3, q2);
907                         }
908                         else if(sd[2] < 0) {
909                                 // 0+-
910                                 VECCOPY(q0, v0);
911                                 VECCOPY(q1, v1);
912                                 VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
913                                 VECCOPY(q3, q2);
914                         }
915                         else {
916                                 // 0+0
917                                 VECCOPY(q0, v0);
918                                 VECCOPY(q1, v1);
919                                 VECCOPY(q2, v2);
920                                 VECCOPY(q3, q2);
921                         }
922                 }
923                 else if(sd[1] < 0) {
924                         if(sd[2] > 0) {
925                                 // 0-+
926                                 VECCOPY(q0, v0);
927                                 VecAddDir(q1, v1, v2, (sd[1]/(sd[1]-sd[2])));
928                                 VECCOPY(q2, v2);
929                                 VECCOPY(q3, q2);
930                         }
931                         else if(sd[2] < 0) {
932                                 // 0--
933                                 return 0;
934                         }
935                         else {
936                                 // 0-0
937                                 return 0;
938                         }
939                 }
940                 else {
941                         if(sd[2] > 0) {
942                                 // 00+
943                                 VECCOPY(q0, v0);
944                                 VECCOPY(q1, v1);
945                                 VECCOPY(q2, v2);
946                                 VECCOPY(q3, q2);
947                         }
948                         else if(sd[2] < 0) {
949                                 // 00-
950                                 return 0;
951                         }
952                         else {
953                                 // 000
954                                 return 0;
955                         }
956                 }
957         }
958
959         return 1;
960 }
961
962 /* altivec optimization, this works, but is unused */
963
964 #if 0
965 #include <Accelerate/Accelerate.h>
966
967 typedef union {
968         vFloat v;
969         float f[4];
970 } vFloatResult;
971
972 static vFloat vec_splat_float(float val)
973 {
974         return (vFloat){val, val, val, val};
975 }
976
977 static float occ_quad_form_factor(float *p, float *n, float *q0, float *q1, float *q2, float *q3)
978 {
979         vFloat vcos, rlen, vrx, vry, vrz, vsrx, vsry, vsrz, gx, gy, gz, vangle;
980         vUInt8 rotate = (vUInt8){4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3};
981         vFloatResult vresult;
982         float result;
983
984         /* compute r* */
985         vrx = (vFloat){q0[0], q1[0], q2[0], q3[0]} - vec_splat_float(p[0]);
986         vry = (vFloat){q0[1], q1[1], q2[1], q3[1]} - vec_splat_float(p[1]);
987         vrz = (vFloat){q0[2], q1[2], q2[2], q3[2]} - vec_splat_float(p[2]);
988
989         /* normalize r* */
990         rlen = vec_rsqrte(vrx*vrx + vry*vry + vrz*vrz + vec_splat_float(1e-16f));
991         vrx = vrx*rlen;
992         vry = vry*rlen;
993         vrz = vrz*rlen;
994
995         /* rotate r* for cross and dot */
996         vsrx= vec_perm(vrx, vrx, rotate);
997         vsry= vec_perm(vry, vry, rotate);
998         vsrz= vec_perm(vrz, vrz, rotate);
999
1000         /* cross product */
1001         gx = vsry*vrz - vsrz*vry;
1002         gy = vsrz*vrx - vsrx*vrz;
1003         gz = vsrx*vry - vsry*vrx;
1004
1005         /* normalize */
1006         rlen = vec_rsqrte(gx*gx + gy*gy + gz*gz + vec_splat_float(1e-16f));
1007         gx = gx*rlen;
1008         gy = gy*rlen;
1009         gz = gz*rlen;
1010
1011         /* angle */
1012         vcos = vrx*vsrx + vry*vsry + vrz*vsrz;
1013         vcos= vec_max(vec_min(vcos, vec_splat_float(1.0f)), vec_splat_float(-1.0f));
1014         vangle= vacosf(vcos);
1015
1016         /* dot */
1017         vresult.v = (vec_splat_float(n[0])*gx +
1018                      vec_splat_float(n[1])*gy +
1019                      vec_splat_float(n[2])*gz)*vangle;
1020
1021         result= (vresult.f[0] + vresult.f[1] + vresult.f[2] + vresult.f[3])*(0.5f/(float)M_PI);
1022         result= MAX2(result, 0.0f);
1023
1024         return result;
1025 }
1026
1027 #endif
1028
1029 /* SSE optimization, acos code doesn't work */
1030
1031 #if 0
1032
1033 #include <xmmintrin.h>
1034
1035 static __m128 sse_approx_acos(__m128 x)
1036 {
1037         /* needs a better approximation than taylor expansion of acos, since that
1038          * gives big erros for near 1.0 values, sqrt(2*x)*acos(1-x) should work
1039          * better, see http://www.tom.womack.net/projects/sse-fast-arctrig.html */
1040
1041         return _mm_set_ps1(1.0f);
1042 }
1043
1044 static float occ_quad_form_factor(float *p, float *n, float *q0, float *q1, float *q2, float *q3)
1045 {
1046         float r0[3], r1[3], r2[3], r3[3], g0[3], g1[3], g2[3], g3[3];
1047         float a1, a2, a3, a4, dot1, dot2, dot3, dot4, result;
1048         float fresult[4] __attribute__((aligned(16)));
1049         __m128 qx, qy, qz, rx, ry, rz, rlen, srx, sry, srz, gx, gy, gz, glen, rcos, angle, aresult;
1050
1051         /* compute r */
1052         qx = _mm_set_ps(q3[0], q2[0], q1[0], q0[0]);
1053         qy = _mm_set_ps(q3[1], q2[1], q1[1], q0[1]);
1054         qz = _mm_set_ps(q3[2], q2[2], q1[2], q0[2]);
1055
1056         rx = qx - _mm_set_ps1(p[0]);
1057         ry = qy - _mm_set_ps1(p[1]);
1058         rz = qz - _mm_set_ps1(p[2]);
1059
1060         /* normalize r */
1061         rlen = _mm_rsqrt_ps(rx*rx + ry*ry + rz*rz + _mm_set_ps1(1e-16f));
1062         rx = rx*rlen;
1063         ry = ry*rlen;
1064         rz = rz*rlen;
1065
1066         /* cross product */
1067         srx = _mm_shuffle_ps(rx, rx, _MM_SHUFFLE(0,3,2,1));
1068         sry = _mm_shuffle_ps(ry, ry, _MM_SHUFFLE(0,3,2,1));
1069         srz = _mm_shuffle_ps(rz, rz, _MM_SHUFFLE(0,3,2,1));
1070
1071         gx = sry*rz - srz*ry;
1072         gy = srz*rx - srx*rz;
1073         gz = srx*ry - sry*rx;
1074
1075         /* normalize g */
1076         glen = _mm_rsqrt_ps(gx*gx + gy*gy + gz*gz + _mm_set_ps1(1e-16f));
1077         gx = gx*glen;
1078         gy = gy*glen;
1079         gz = gz*glen;
1080
1081         /* compute angle */
1082         rcos = rx*srx + ry*sry + rz*srz;
1083         rcos= _mm_max_ps(_mm_min_ps(rcos, _mm_set_ps1(1.0f)), _mm_set_ps1(-1.0f));
1084
1085         angle = sse_approx_cos(rcos);
1086         aresult = (_mm_set_ps1(n[0])*gx + _mm_set_ps1(n[1])*gy + _mm_set_ps1(n[2])*gz)*angle;
1087
1088         /* sum together */
1089     result= (fresult[0] + fresult[1] + fresult[2] + fresult[3])*(0.5f/(float)M_PI);
1090         result= MAX2(result, 0.0f);
1091
1092         return result;
1093 }
1094
1095 #endif
1096
1097 static float saacosf(float fac)
1098 {
1099         if(fac<= -1.0f) return (float)M_PI;
1100         else if(fac>=1.0f) return 0.0f;
1101         else return acos(fac); /* acosf(fac) */
1102 }
1103
1104 static void normalizef(float *n)
1105 {
1106         float d;
1107         
1108         d= INPR(n, n);
1109
1110         if(d > 1.0e-35F) {
1111                 d= 1.0f/sqrt(d); /* sqrtf(d) */
1112
1113                 n[0] *= d; 
1114                 n[1] *= d; 
1115                 n[2] *= d;
1116         } 
1117 }
1118
1119 static float occ_quad_form_factor(float *p, float *n, float *q0, float *q1, float *q2, float *q3)
1120 {
1121         float r0[3], r1[3], r2[3], r3[3], g0[3], g1[3], g2[3], g3[3];
1122         float a1, a2, a3, a4, dot1, dot2, dot3, dot4, result;
1123
1124         VECSUB(r0, q0, p);
1125         VECSUB(r1, q1, p);
1126         VECSUB(r2, q2, p);
1127         VECSUB(r3, q3, p);
1128
1129         normalizef(r0);
1130         normalizef(r1);
1131         normalizef(r2);
1132         normalizef(r3);
1133
1134         Crossf(g0, r1, r0); normalizef(g0);
1135         Crossf(g1, r2, r1); normalizef(g1);
1136         Crossf(g2, r3, r2); normalizef(g2);
1137         Crossf(g3, r0, r3); normalizef(g3);
1138
1139         a1= saacosf(INPR(r0, r1));
1140         a2= saacosf(INPR(r1, r2));
1141         a3= saacosf(INPR(r2, r3));
1142         a4= saacosf(INPR(r3, r0));
1143
1144         dot1= INPR(n, g0);
1145         dot2= INPR(n, g1);
1146         dot3= INPR(n, g2);
1147         dot4= INPR(n, g3);
1148
1149         result= (a1*dot1 + a2*dot2 + a3*dot3 + a4*dot4)*0.5f/(float)M_PI;
1150         result= MAX2(result, 0.0f);
1151
1152         return result;
1153 }
1154
1155 float occ_form_factor(OccFace *face, float *p, float *n)
1156 {
1157         ObjectInstanceRen *obi;
1158         VlakRen *vlr;
1159         float v1[3], v2[3], v3[3], v4[3], q0[3], q1[3], q2[3], q3[3], contrib= 0.0f;
1160
1161         obi= &R.objectinstance[face->obi];
1162         vlr= RE_findOrAddVlak(obi->obr, face->facenr);
1163
1164         VECCOPY(v1, vlr->v1->co);
1165         VECCOPY(v2, vlr->v2->co);
1166         VECCOPY(v3, vlr->v3->co);
1167
1168         if(obi->flag & R_TRANSFORMED) {
1169                 Mat4MulVecfl(obi->mat, v1);
1170                 Mat4MulVecfl(obi->mat, v2);
1171                 Mat4MulVecfl(obi->mat, v3);
1172         }
1173
1174         if(occ_visible_quad(p, n, v1, v2, v3, q0, q1, q2, q3))
1175                 contrib += occ_quad_form_factor(p, n, q0, q1, q2, q3);
1176
1177         if(vlr->v4) {
1178                 VECCOPY(v4, vlr->v4->co);
1179                 if(obi->flag & R_TRANSFORMED)
1180                         Mat4MulVecfl(obi->mat, v4);
1181
1182                 if(occ_visible_quad(p, n, v1, v3, v4, q0, q1, q2, q3))
1183                         contrib += occ_quad_form_factor(p, n, q0, q1, q2, q3);
1184         }
1185
1186         return contrib;
1187 }
1188
1189 static void occ_lookup(OcclusionTree *tree, int thread, OccFace *exclude, float *pp, float *pn, float *occ, float *bentn)
1190 {
1191         OccNode *node, **stack;
1192         OccFace *face;
1193         float resultocc, v[3], p[3], n[3], co[3], invd2;
1194         float distfac, fac, error, d2, weight, emitarea;
1195         int b, totstack;
1196
1197         /* init variables */
1198         VECCOPY(p, pp);
1199         VECCOPY(n, pn);
1200         VECADDFAC(p, p, n, 1e-4f);
1201
1202         if(bentn)
1203                 VECCOPY(bentn, n);
1204         
1205         error= tree->error;
1206         distfac= tree->distfac;
1207
1208         resultocc= 0.0f;
1209
1210         /* init stack */
1211         stack= tree->stack[thread];
1212         stack[0]= tree->root;
1213         totstack= 1;
1214
1215         while(totstack) {
1216                 /* pop point off the stack */
1217                 node= stack[--totstack];
1218
1219                 VECSUB(v, node->co, p);
1220                 d2= INPR(v, v) + 1e-16f;
1221                 emitarea= MAX2(node->area, node->dco);
1222
1223                 if(d2*error > emitarea) {
1224                         if(distfac != 0.0f) {
1225                                 fac= 1.0f/(1.0f + distfac*d2);
1226                                 if(fac < 0.01f)
1227                                         continue;
1228                         }
1229                         else
1230                                 fac= 1.0f;
1231
1232                         /* accumulate occlusion from spherical harmonics */
1233                         invd2 = 1.0f/sqrt(d2);
1234                         weight= occ_solid_angle(node, v, d2, invd2, n);
1235                         weight *= node->occlusion;
1236
1237                         if(bentn) {
1238                                 bentn[0] -= weight*invd2*v[0];
1239                                 bentn[1] -= weight*invd2*v[1];
1240                                 bentn[2] -= weight*invd2*v[2];
1241                         }
1242
1243                         resultocc += weight*fac;
1244                 }
1245                 else {
1246                         /* traverse into children */
1247                         for(b=0; b<TOTCHILD; b++) {
1248                                 if(node->childflag & (1<<b)) {
1249                                         face= tree->face+node->child[b].face;
1250
1251                                         /* accumulate occlusion with face form factor */
1252                                         if(!exclude || !(face->obi == exclude->obi && face->facenr == exclude->facenr)) {
1253                                                 if(bentn || distfac != 0.0f) {
1254                                                         occ_face(face, co, NULL, NULL); 
1255                                                         VECSUB(v, co, p);
1256                                                         d2= INPR(v, v) + 1e-16f;
1257
1258                                                         fac= (distfac == 0.0f)? 1.0f: 1.0f/(1.0f + distfac*d2);
1259                                                         if(fac < 0.01f)
1260                                                                 continue;
1261                                                 }
1262                                                 else
1263                                                         fac= 1.0f;
1264
1265                                                 weight= occ_form_factor(face, p, n);
1266                                                 weight *= tree->occlusion[node->child[b].face];
1267
1268                                                 if(bentn) {
1269                                                         invd2= 1.0f/sqrt(d2);
1270                                                         bentn[0] -= weight*invd2*v[0];
1271                                                         bentn[1] -= weight*invd2*v[1];
1272                                                         bentn[2] -= weight*invd2*v[2];
1273                                                 }
1274
1275                                                 resultocc += weight*fac;
1276                                         }
1277                                 }
1278                                 else if(node->child[b].node) {
1279                                         /* push child on the stack */
1280                                         stack[totstack++]= node->child[b].node;
1281                                 }
1282                         }
1283                 }
1284         }
1285
1286         if(occ) *occ= resultocc;
1287 }
1288
1289 static void occ_compute_passes(Render *re, OcclusionTree *tree, int totpass)
1290 {
1291         float *occ, co[3], n[3];
1292         int pass, i;
1293         
1294         occ= MEM_callocN(sizeof(float)*tree->totface, "OcclusionPassOcc");
1295
1296         for(pass=0; pass<totpass; pass++) {
1297                 for(i=0; i<tree->totface; i++) {
1298                         occ_face(&tree->face[i], co, n, NULL);
1299                         VecMulf(n, -1.0f);
1300                         VECADDFAC(co, co, n, 1e-8f);
1301
1302                         occ_lookup(tree, 0, &tree->face[i], co, n, &occ[i], NULL);
1303                         if(re->test_break())
1304                                 break;
1305                 }
1306
1307                 if(re->test_break())
1308                         break;
1309
1310                 for(i=0; i<tree->totface; i++) {
1311                         tree->occlusion[i] -= occ[i]; //MAX2(1.0f-occ[i], 0.0f);
1312                         if(tree->occlusion[i] < 0.0f)
1313                                 tree->occlusion[i]= 0.0f;
1314                 }
1315
1316                 occ_sum_occlusion(tree, tree->root);
1317         }
1318
1319         MEM_freeN(occ);
1320 }
1321
1322 static void sample_occ_tree(Render *re, OcclusionTree *tree, OccFace *exclude, float *co, float *n, int thread, int onlyshadow, float *skycol)
1323 {
1324         float nn[3], bn[3], dxyview[3], fac, occ, occlusion, correction;
1325         int aocolor;
1326
1327         aocolor= re->wrld.aocolor;
1328         if(onlyshadow)
1329                 aocolor= WO_AOPLAIN;
1330
1331         VECCOPY(nn, n);
1332         VecMulf(nn, -1.0f);
1333
1334         occ_lookup(tree, thread, exclude, co, nn, &occ, (aocolor)? bn: NULL);
1335
1336         correction= re->wrld.ao_approx_correction;
1337
1338         occlusion= (1.0f-correction)*(1.0f-occ);
1339         CLAMP(occlusion, 0.0f, 1.0f);
1340         if(correction != 0.0f)
1341                 occlusion += correction*exp(-occ);
1342
1343         if(aocolor) {
1344                 /* sky shading using bent normal */
1345                 if(aocolor==WO_AOSKYCOL) {
1346                         fac= 0.5*(1.0f+bn[0]*re->grvec[0]+ bn[1]*re->grvec[1]+ bn[2]*re->grvec[2]);
1347                         skycol[0]= (1.0f-fac)*re->wrld.horr + fac*re->wrld.zenr;
1348                         skycol[1]= (1.0f-fac)*re->wrld.horg + fac*re->wrld.zeng;
1349                         skycol[2]= (1.0f-fac)*re->wrld.horb + fac*re->wrld.zenb;
1350                 }
1351                 else {  /* WO_AOSKYTEX */
1352                         dxyview[0]= 1.0f;
1353                         dxyview[1]= 1.0f;
1354                         dxyview[2]= 0.0f;
1355                         shadeSkyView(skycol, co, bn, dxyview);
1356                 }
1357
1358                 VecMulf(skycol, occlusion);
1359         }
1360         else {
1361                 skycol[0]= occlusion;
1362                 skycol[1]= occlusion;
1363                 skycol[2]= occlusion;
1364         }
1365 }
1366
1367 /* ---------------------------- Caching ------------------------------- */
1368
1369 static OcclusionCacheSample *find_occ_sample(OcclusionCache *cache, int x, int y)
1370 {
1371         x -= cache->x;
1372         y -= cache->y;
1373
1374         x /= cache->step;
1375         y /= cache->step;
1376         x *= cache->step;
1377         y *= cache->step;
1378
1379         if(x < 0 || x >= cache->w || y < 0 || y >= cache->h)
1380                 return NULL;
1381         else
1382                 return &cache->sample[y*cache->w + x];
1383 }
1384
1385 static int sample_occ_cache(OcclusionTree *tree, float *co, float *n, int x, int y, int thread, float *col)
1386 {
1387         OcclusionCache *cache;
1388         OcclusionCacheSample *samples[4], *sample;
1389         float wn[4], wz[4], wb[4], tx, ty, w, totw, mino, maxo;
1390         float d[3], dist2;
1391         int i, x1, y1, x2, y2;
1392
1393         if(!tree->cache)
1394                 return 0;
1395         
1396         /* first try to find a sample in the same pixel */
1397         cache= &tree->cache[thread];
1398
1399         if(cache->sample && cache->step) {
1400                 sample= &cache->sample[(y-cache->y)*cache->w + (x-cache->x)];
1401                 if(sample->filled) {
1402                         VECSUB(d, sample->co, co);
1403                         dist2= INPR(d, d);
1404                         if(dist2 < 0.5f*sample->dist2 && INPR(sample->n, n) > 0.98f) {
1405                                 VECCOPY(col, sample->col);
1406                                 return 1;
1407                         }
1408                 }
1409         }
1410         else
1411                 return 0;
1412
1413         /* try to interpolate between 4 neighbouring pixels */
1414         samples[0]= find_occ_sample(cache, x, y);
1415         samples[1]= find_occ_sample(cache, x+cache->step, y);
1416         samples[2]= find_occ_sample(cache, x, y+cache->step);
1417         samples[3]= find_occ_sample(cache, x+cache->step, y+cache->step);
1418
1419         for(i=0; i<4; i++)
1420                 if(!samples[i] || !samples[i]->filled)
1421                         return 0;
1422
1423         /* require intensities not being too different */
1424         mino= MIN4(samples[0]->intensity, samples[1]->intensity, samples[2]->intensity, samples[3]->intensity);
1425         maxo= MAX4(samples[0]->intensity, samples[1]->intensity, samples[2]->intensity, samples[3]->intensity);
1426
1427         if(maxo - mino > 0.05f)
1428                 return 0;
1429
1430         /* compute weighted interpolation between samples */
1431         col[0]= col[1]= col[2]= 0.0f;
1432         totw= 0.0f;
1433
1434         x1= samples[0]->x;
1435         y1= samples[0]->y;
1436         x2= samples[3]->x;
1437         y2= samples[3]->y;
1438
1439         tx= (float)(x2 - x)/(float)(x2 - x1);
1440         ty= (float)(y2 - y)/(float)(y2 - y1);
1441
1442         wb[3]= (1.0f-tx)*(1.0f-ty);
1443         wb[2]= (tx)*(1.0f-ty);
1444         wb[1]= (1.0f-tx)*(ty);
1445         wb[0]= tx*ty;
1446
1447         for(i=0; i<4; i++) {
1448                 VECSUB(d, samples[i]->co, co);
1449                 dist2= INPR(d, d);
1450
1451                 wz[i]= 1.0f; //(samples[i]->dist2/(1e-4f + dist2));
1452                 wn[i]= pow(INPR(samples[i]->n, n), 32.0f);
1453
1454                 w= wb[i]*wn[i]*wz[i];
1455
1456                 totw += w;
1457                 col[0] += w*samples[i]->col[0];
1458                 col[1] += w*samples[i]->col[1];
1459                 col[2] += w*samples[i]->col[2];
1460         }
1461
1462         if(totw >= 0.9f) {
1463                 totw= 1.0f/totw;
1464                 col[0] *= totw;
1465                 col[1] *= totw;
1466                 col[2] *= totw;
1467                 return 1;
1468         }
1469
1470         return 0;
1471 }
1472
1473 static void sample_occ_surface(ShadeInput *shi)
1474 {
1475         StrandRen *strand= shi->strand;
1476         StrandSurface *mesh= strand->buffer->surface;
1477         int *face, *index = RE_strandren_get_face(shi->obr, strand, 0);
1478         float w[4], *co1, *co2, *co3, *co4;
1479
1480         if(mesh && index) {
1481                 face= mesh->face[*index];
1482
1483                 co1= mesh->co[face[0]];
1484                 co2= mesh->co[face[1]];
1485                 co3= mesh->co[face[2]];
1486                 co4= (face[3])? mesh->co[face[3]]: NULL;
1487
1488                 InterpWeightsQ3Dfl(co1, co2, co3, co4, strand->vert->co, w);
1489
1490                 shi->ao[0]= shi->ao[1]= shi->ao[2]= 0.0f;
1491                 VECADDFAC(shi->ao, shi->ao, mesh->col[face[0]], w[0]);
1492                 VECADDFAC(shi->ao, shi->ao, mesh->col[face[1]], w[1]);
1493                 VECADDFAC(shi->ao, shi->ao, mesh->col[face[2]], w[2]);
1494                 if(face[3])
1495                         VECADDFAC(shi->ao, shi->ao, mesh->col[face[3]], w[3]);
1496         }
1497         else {
1498                 shi->ao[0]= 1.0f;
1499                 shi->ao[1]= 1.0f;
1500                 shi->ao[2]= 1.0f;
1501         }
1502 }
1503
1504 /* ------------------------- External Functions --------------------------- */
1505
1506 static void *exec_strandsurface_sample(void *data)
1507 {
1508         OcclusionThread *othread= (OcclusionThread*)data;
1509         Render *re= othread->re;
1510         StrandSurface *mesh= othread->mesh;
1511         float col[3], co[3], n[3], *co1, *co2, *co3, *co4;
1512         int a, *face;
1513
1514         for(a=othread->begin; a<othread->end; a++) {
1515                 face= mesh->face[a];
1516                 co1= mesh->co[face[0]];
1517                 co2= mesh->co[face[1]];
1518                 co3= mesh->co[face[2]];
1519
1520                 if(face[3]) {
1521                         co4= mesh->co[face[3]];
1522
1523                         VecLerpf(co, co1, co3, 0.5f);
1524                         CalcNormFloat4(co1, co2, co3, co4, n);
1525                 }
1526                 else {
1527                         CalcCent3f(co, co1, co2, co3);
1528                         CalcNormFloat(co1, co2, co3, n);
1529                 }
1530                 VecMulf(n, -1.0f);
1531
1532                 sample_occ_tree(re, re->occlusiontree, NULL, co, n, othread->thread, 0, col);
1533                 VECCOPY(othread->facecol[a], col);
1534         }
1535
1536         return 0;
1537 }
1538
1539 void make_occ_tree(Render *re)
1540 {
1541         OcclusionThread othreads[BLENDER_MAX_THREADS];
1542         StrandSurface *mesh;
1543         ListBase threads;
1544         float col[3], (*facecol)[3];
1545         int a, totface, totthread, *face, *count;
1546
1547         /* ugly, needed for occ_face */
1548         R= *re;
1549
1550         re->i.infostr= "Occlusion preprocessing";
1551         re->stats_draw(&re->i);
1552         
1553         re->occlusiontree= occ_tree_build(re);
1554         
1555         if(re->occlusiontree) {
1556                 if(re->wrld.ao_approx_passes)
1557                         occ_compute_passes(re, re->occlusiontree, re->wrld.ao_approx_passes);
1558
1559                 for(mesh=re->strandsurface.first; mesh; mesh=mesh->next) {
1560                         count= MEM_callocN(sizeof(int)*mesh->totvert, "OcclusionCount");
1561                         facecol= MEM_callocN(sizeof(float)*3*mesh->totface, "StrandSurfFaceCol");
1562
1563                         totthread= (mesh->totface > 10000)? re->r.threads: 1;
1564                         totface= mesh->totface/totthread;
1565                         for(a=0; a<totthread; a++) {
1566                                 othreads[a].re= re;
1567                                 othreads[a].facecol= facecol;
1568                                 othreads[a].thread= a;
1569                                 othreads[a].mesh= mesh;
1570                                 othreads[a].begin= a*totface;
1571                                 othreads[a].end= (a == totthread-1)? mesh->totface: (a+1)*totface;
1572                         }
1573
1574                         if(totthread == 1) {
1575                                 exec_strandsurface_sample(&othreads[0]);
1576                         }
1577                         else {
1578                                 BLI_init_threads(&threads, exec_strandsurface_sample, totthread);
1579
1580                                 for(a=0; a<totthread; a++)
1581                                         BLI_insert_thread(&threads, &othreads[a]);
1582
1583                                 BLI_end_threads(&threads);
1584                         }
1585
1586                         for(a=0; a<mesh->totface; a++) {
1587                                 face= mesh->face[a];
1588
1589                                 VECCOPY(col, facecol[a]);
1590                                 VECADD(mesh->col[face[0]], mesh->col[face[0]], col);
1591                                 count[face[0]]++;
1592                                 VECADD(mesh->col[face[1]], mesh->col[face[1]], col);
1593                                 count[face[1]]++;
1594                                 VECADD(mesh->col[face[2]], mesh->col[face[2]], col);
1595                                 count[face[2]]++;
1596
1597                                 if(face[3]) {
1598                                         VECADD(mesh->col[face[3]], mesh->col[face[3]], col);
1599                                         count[face[3]]++;
1600                                 }
1601                         }
1602
1603                         for(a=0; a<mesh->totvert; a++)
1604                                 if(count[a])
1605                                         VecMulf(mesh->col[a], 1.0f/count[a]);
1606
1607                         MEM_freeN(count);
1608                         MEM_freeN(facecol);
1609                 }
1610         }
1611 }
1612
1613 void free_occ(Render *re)
1614 {
1615         if(re->occlusiontree) {
1616                 occ_free_tree(re->occlusiontree);
1617                 re->occlusiontree = NULL;
1618         }
1619 }
1620
1621 void sample_occ(Render *re, ShadeInput *shi)
1622 {
1623         OcclusionTree *tree= re->occlusiontree;
1624         OcclusionCache *cache;
1625         OcclusionCacheSample *sample;
1626         OccFace exclude;
1627         int onlyshadow;
1628
1629         if(tree) {
1630                 if(shi->strand) {
1631                         sample_occ_surface(shi);
1632                 }
1633                 /* try to get result from the cache if possible */
1634                 else if(!sample_occ_cache(tree, shi->co, shi->vno, shi->xs, shi->ys, shi->thread, shi->ao)) {
1635                         /* no luck, let's sample the occlusion */
1636                         exclude.obi= shi->obi - re->objectinstance;
1637                         exclude.facenr= shi->vlr->index;
1638                         onlyshadow= (shi->mat->mode & MA_ONLYSHADOW);
1639                         sample_occ_tree(re, tree, &exclude, shi->co, shi->vno, shi->thread, onlyshadow, shi->ao);
1640
1641                         /* fill result into sample, each time */
1642                         if(tree->cache) {
1643                                 cache= &tree->cache[shi->thread];
1644
1645                                 if(cache->sample && cache->step) {
1646                                         sample= &cache->sample[(shi->ys-cache->y)*cache->w + (shi->xs-cache->x)];
1647                                         VECCOPY(sample->co, shi->co);
1648                                         VECCOPY(sample->n, shi->vno);
1649                                         VECCOPY(sample->col, shi->ao);
1650                                         sample->intensity= MAX3(sample->col[0], sample->col[1], sample->col[2]);
1651                                         sample->dist2= INPR(shi->dxco, shi->dxco) + INPR(shi->dyco, shi->dyco);
1652                                         sample->filled= 1;
1653                                 }
1654                         }
1655                 }
1656         }
1657         else {
1658                 shi->ao[0]= 1.0f;
1659                 shi->ao[1]= 1.0f;
1660                 shi->ao[2]= 1.0f;
1661         }
1662 }
1663
1664 void cache_occ_samples(Render *re, RenderPart *pa, ShadeSample *ssamp)
1665 {
1666         OcclusionTree *tree= re->occlusiontree;
1667         PixStr ps;
1668         OcclusionCache *cache;
1669         OcclusionCacheSample *sample;
1670         OccFace exclude;
1671         ShadeInput *shi;
1672         long *rd=NULL;
1673         int *ro=NULL, *rp=NULL, *rz=NULL, onlyshadow;
1674         int x, y, step = CACHE_STEP;
1675
1676         if(!tree->cache)
1677                 return;
1678
1679         cache= &tree->cache[pa->thread];
1680         cache->w= pa->rectx;
1681         cache->h= pa->recty;
1682         cache->x= pa->disprect.xmin;
1683         cache->y= pa->disprect.ymin;
1684         cache->step= step;
1685         cache->sample= MEM_callocN(sizeof(OcclusionCacheSample)*cache->w*cache->h, "OcclusionCacheSample");
1686         sample= cache->sample;
1687
1688         if(re->osa) {
1689                 rd= pa->rectdaps;
1690         }
1691         else {
1692                 /* fake pixel struct for non-osa */
1693                 ps.next= NULL;
1694                 ps.mask= 0xFFFF;
1695
1696                 ro= pa->recto;
1697                 rp= pa->rectp;
1698                 rz= pa->rectz;
1699         }
1700
1701         /* compute a sample at every step pixels */
1702         for(y=pa->disprect.ymin; y<pa->disprect.ymax; y++) {
1703                 for(x=pa->disprect.xmin; x<pa->disprect.xmax; x++, sample++, rd++, ro++, rp++, rz++) {
1704                         if(!(((x - pa->disprect.xmin + step) % step) == 0 || x == pa->disprect.xmax-1))
1705                                 continue;
1706                         if(!(((y - pa->disprect.ymin + step) % step) == 0 || y == pa->disprect.ymax-1))
1707                                 continue;
1708
1709                         if(re->osa) {
1710                                 if(!*rd) continue;
1711
1712                                 shade_samples_fill_with_ps(ssamp, (PixStr *)(*rd), x, y);
1713                         }
1714                         else {
1715                                 if(!*rp) continue;
1716
1717                                 ps.obi= *ro;
1718                                 ps.facenr= *rp;
1719                                 ps.z= *rz;
1720                                 shade_samples_fill_with_ps(ssamp, &ps, x, y);
1721                         }
1722
1723                         shi= ssamp->shi;
1724                         if(shi->vlr) {
1725                                 onlyshadow= (shi->mat->mode & MA_ONLYSHADOW);
1726                                 exclude.obi= shi->obi - re->objectinstance;
1727                                 exclude.facenr= shi->vlr->index;
1728                                 sample_occ_tree(re, tree, &exclude, shi->co, shi->vno, shi->thread, onlyshadow, shi->ao);
1729
1730                                 VECCOPY(sample->co, shi->co);
1731                                 VECCOPY(sample->n, shi->vno);
1732                                 VECCOPY(sample->col, shi->ao);
1733                                 sample->intensity= MAX3(sample->col[0], sample->col[1], sample->col[2]);
1734                                 sample->dist2= INPR(shi->dxco, shi->dxco) + INPR(shi->dyco, shi->dyco);
1735                                 sample->x= shi->xs;
1736                                 sample->y= shi->ys;
1737                                 sample->filled= 1;
1738                         }
1739
1740                         if(re->test_break())
1741                                 break;
1742                 }
1743         }
1744 }
1745
1746 void free_occ_samples(Render *re, RenderPart *pa)
1747 {
1748         OcclusionTree *tree= re->occlusiontree;
1749         OcclusionCache *cache;
1750
1751         if(tree->cache) {
1752                 cache= &tree->cache[pa->thread];
1753
1754                 if(cache->sample)
1755                         MEM_freeN(cache->sample);
1756
1757                 cache->w= 0;
1758                 cache->h= 0;
1759                 cache->step= 0;
1760         }
1761 }
1762