Fix for bug #18032: AAO pixel cache + refraction artifacts, the pixel
[blender.git] / source / blender / render / intern / source / occlusion.c
1 /* 
2  * $Id$
3  *
4  * ***** BEGIN GPL LICENSE BLOCK *****
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software Foundation,
18  * Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
19  *
20  * The Original Code is Copyright (C) 2008 Blender Foundation.
21  * All rights reserved.
22  *
23  * The Original Code is: all of this file.
24  *
25  * Contributor(s): Brecht Van Lommel.
26  *
27  * ***** END GPL LICENSE BLOCK *****
28  */
29
30 #include <math.h>
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34
35 #include "MEM_guardedalloc.h"
36
37 #include "DNA_material_types.h"
38
39 #include "BLI_arithb.h"
40 #include "BLI_blenlib.h"
41 #include "BLI_memarena.h"
42 #include "BLI_threads.h"
43
44 #include "BKE_global.h"
45 #include "BKE_scene.h"
46 #include "BKE_utildefines.h"
47
48 #include "RE_shader_ext.h"
49
50 /* local includes */
51 #include "occlusion.h"
52 #include "render_types.h"
53 #include "rendercore.h"
54 #include "renderdatabase.h"
55 #include "pixelshading.h"
56 #include "shading.h"
57 #include "zbuf.h"
58
59 /* ------------------------- Declarations --------------------------- */
60
61 #define INVALID_INDEX ((int)(~0))
62 #define INVPI 0.31830988618379069f
63 #define TOTCHILD 8
64 #define CACHE_STEP 3
65
66 typedef struct OcclusionCacheSample {
67         float co[3], n[3], col[3], intensity, dist2;
68         int x, y, filled;
69 } OcclusionCacheSample;
70
71 typedef struct OcclusionCache {
72         OcclusionCacheSample *sample;
73         int x, y, w, h, step;
74 } OcclusionCache;
75
76 typedef struct OccFace {
77         int obi;
78         int facenr;
79 } OccFace;
80
81 typedef struct OccNode {
82         float co[3], area;
83         float sh[9], dco;
84         float occlusion;
85         int childflag;
86         union {
87                 //OccFace face;
88                 int face;
89                 struct OccNode *node;
90         } child[TOTCHILD];
91 } OccNode;
92
93 typedef struct OcclusionTree {
94         MemArena *arena;
95
96         float (*co)[3];         /* temporary during build */
97
98         OccFace *face;          /* instance and face indices */
99         float *occlusion;       /* occlusion for faces */
100         
101         OccNode *root;
102
103         OccNode **stack[BLENDER_MAX_THREADS];
104         int maxdepth;
105
106         int totface;
107
108         float error;
109         float distfac;
110
111         int dothreadedbuild;
112         int totbuildthread;
113
114         OcclusionCache *cache;
115 } OcclusionTree;
116
117 typedef struct OcclusionThread {
118         Render *re;
119         StrandSurface *mesh;
120         float (*facecol)[3];
121         int begin, end;
122         int thread;
123 } OcclusionThread;
124
125 typedef struct OcclusionBuildThread {
126         OcclusionTree *tree;
127         int begin, end, depth;
128         OccNode *node;
129 } OcclusionBuildThread;
130
131 /* ------------------------- Shading --------------------------- */
132
133 extern Render R; // meh
134
135 #if 0
136 static void occ_shade(ShadeSample *ssamp, ObjectInstanceRen *obi, VlakRen *vlr, float *rad)
137 {
138         ShadeInput *shi= ssamp->shi;
139         ShadeResult *shr= ssamp->shr;
140         float l, u, v, *v1, *v2, *v3;
141         
142         /* init */
143         if(vlr->v4) {
144                 shi->u= u= 0.5f;
145                 shi->v= v= 0.5f;
146         }
147         else {
148                 shi->u= u= 1.0f/3.0f;
149                 shi->v= v= 1.0f/3.0f;
150         }
151
152         /* setup render coordinates */
153         v1= vlr->v1->co;
154         v2= vlr->v2->co;
155         v3= vlr->v3->co;
156         
157         /* renderco */
158         l= 1.0f-u-v;
159         
160         shi->co[0]= l*v3[0]+u*v1[0]+v*v2[0];
161         shi->co[1]= l*v3[1]+u*v1[1]+v*v2[1];
162         shi->co[2]= l*v3[2]+u*v1[2]+v*v2[2];
163         
164         shade_input_set_triangle_i(shi, obi, vlr, 0, 1, 2);
165
166         /* set up view vector */
167         VECCOPY(shi->view, shi->co);
168         Normalize(shi->view);
169         
170         /* cache for shadow */
171         shi->samplenr++;
172         
173         shi->xs= 0; // TODO
174         shi->ys= 0;
175         
176         shade_input_set_normals(shi);
177
178         /* no normal flip */
179         if(shi->flippednor)
180                 shade_input_flip_normals(shi);
181
182         /* not a pretty solution, but fixes common cases */
183         if(shi->obr->ob && shi->obr->ob->transflag & OB_NEG_SCALE) {
184                 VecNegf(shi->vn);
185                 VecNegf(shi->vno);
186         }
187
188         /* init material vars */
189         // note, keep this synced with render_types.h
190         memcpy(&shi->r, &shi->mat->r, 23*sizeof(float));
191         shi->har= shi->mat->har;
192         
193         /* render */
194         shade_input_set_shade_texco(shi);
195         shade_material_loop(shi, shr); /* todo: nodes */
196         
197         VECCOPY(rad, shr->combined);
198 }
199
200 static void occ_build_shade(Render *re, OcclusionTree *tree)
201 {
202         ShadeSample ssamp;
203         ObjectInstanceRen *obi;
204         VlakRen *vlr;
205         int a;
206
207         R= *re;
208
209         /* setup shade sample with correct passes */
210         memset(&ssamp, 0, sizeof(ShadeSample));
211         ssamp.shi[0].lay= re->scene->lay;
212         ssamp.shi[0].passflag= SCE_PASS_DIFFUSE|SCE_PASS_RGBA;
213         ssamp.shi[0].combinedflag= ~(SCE_PASS_SPEC);
214         ssamp.tot= 1;
215
216         for(a=0; a<tree->totface; a++) {
217                 obi= &R.objectinstance[tree->face[a].obi];
218                 vlr= RE_findOrAddVlak(obi->obr, tree->face[a].vlr);
219
220                 occ_shade(&ssamp, obi, vlr, tree->rad[a]);
221         }
222 }
223 #endif
224
225 /* ------------------------- Spherical Harmonics --------------------------- */
226
227 /* Use 2nd order SH => 9 coefficients, stored in this order:
228    0 = (0,0),
229    1 = (1,-1), 2 = (1,0), 3 = (1,1),
230    4 = (2,-2), 5 = (2,-1), 6 = (2,0), 7 = (2,1), 8 = (2,2) */
231
232 static void sh_copy(float *shresult, float *sh)
233 {
234         memcpy(shresult, sh, sizeof(float)*9);
235 }
236
237 static void sh_mul(float *sh, float f)
238 {
239         int i;
240
241         for(i=0; i<9; i++)
242                 sh[i] *= f;
243 }
244
245 static void sh_add(float *shresult, float *sh1, float *sh2)
246 {
247         int i;
248
249         for(i=0; i<9; i++)
250                 shresult[i]= sh1[i] + sh2[i];
251 }
252
253 static void sh_from_disc(float *n, float area, float *shresult)
254 {
255         /* See formula (3) in:
256            "An Efficient Representation for Irradiance Environment Maps" */
257         float sh[9], x, y, z;
258
259         x= n[0];
260         y= n[1];
261         z= n[2];
262
263         sh[0]= 0.282095f;
264
265         sh[1]= 0.488603f*y;
266         sh[2]= 0.488603f*z;
267         sh[3]= 0.488603f*x;
268         
269         sh[4]= 1.092548f*x*y;
270         sh[5]= 1.092548f*y*z;
271         sh[6]= 0.315392f*(3.0f*z*z - 1.0f);
272         sh[7]= 1.092548f*x*z;
273         sh[8]= 0.546274f*(x*x - y*y);
274
275         sh_mul(sh, area);
276         sh_copy(shresult, sh);
277 }
278
279 static float sh_eval(float *sh, float *v)
280 {
281         /* See formula (13) in:
282            "An Efficient Representation for Irradiance Environment Maps" */
283         static const float c1 = 0.429043f, c2 = 0.511664f, c3 = 0.743125f;
284         static const float c4 = 0.886227f, c5 = 0.247708f;
285         float x, y, z, sum;
286
287         x= v[0];
288         y= v[1];
289         z= v[2];
290
291         sum= c1*sh[8]*(x*x - y*y);
292         sum += c3*sh[6]*z*z;
293         sum += c4*sh[0];
294         sum += -c5*sh[6];
295         sum += 2.0f*c1*(sh[4]*x*y + sh[7]*x*z + sh[5]*y*z);
296         sum += 2.0f*c2*(sh[3]*x + sh[1]*y + sh[2]*z);
297
298         return sum;
299 }
300
301 /* ------------------------------ Building --------------------------------- */
302
303 static void occ_face(const OccFace *face, float *co, float *normal, float *area)
304 {
305         ObjectInstanceRen *obi;
306         VlakRen *vlr;
307         float v1[3], v2[3], v3[3], v4[3];
308
309         obi= &R.objectinstance[face->obi];
310         vlr= RE_findOrAddVlak(obi->obr, face->facenr);
311         
312         if(co) {
313                 if(vlr->v4)
314                         VecLerpf(co, vlr->v1->co, vlr->v3->co, 0.5f);
315                 else
316                         CalcCent3f(co, vlr->v1->co, vlr->v2->co, vlr->v3->co);
317
318                 if(obi->flag & R_TRANSFORMED)
319                         Mat4MulVecfl(obi->mat, co);
320         }
321         
322         if(normal) {
323                 normal[0]= -vlr->n[0];
324                 normal[1]= -vlr->n[1];
325                 normal[2]= -vlr->n[2];
326
327                 if(obi->flag & R_TRANSFORMED)
328                         Mat3MulVecfl(obi->nmat, normal);
329         }
330
331         if(area) {
332                 VECCOPY(v1, vlr->v1->co);
333                 VECCOPY(v2, vlr->v2->co);
334                 VECCOPY(v3, vlr->v3->co);
335                 if(vlr->v4) VECCOPY(v4, vlr->v4->co);
336
337                 if(obi->flag & R_TRANSFORMED) {
338                         Mat4MulVecfl(obi->mat, v1);
339                         Mat4MulVecfl(obi->mat, v2);
340                         Mat4MulVecfl(obi->mat, v3);
341                         if(vlr->v4) Mat4MulVecfl(obi->mat, v4);
342                 }
343
344                 /* todo: correct area for instances */
345                 if(vlr->v4)
346                         *area= AreaQ3Dfl(v1, v2, v3, v4);
347                 else
348                         *area= AreaT3Dfl(v1, v2, v3);
349         }
350 }
351
352 static void occ_sum_occlusion(OcclusionTree *tree, OccNode *node)
353 {
354         OccNode *child;
355         float occ, area, totarea;
356         int a, b;
357
358         occ= 0.0f;
359         totarea= 0.0f;
360
361         for(b=0; b<TOTCHILD; b++) {
362                 if(node->childflag & (1<<b)) {
363                         a= node->child[b].face;
364                         occ_face(&tree->face[a], 0, 0, &area);
365                         occ += area*tree->occlusion[a];
366                         totarea += area;
367                 }
368                 else if(node->child[b].node) {
369                         child= node->child[b].node;
370                         occ_sum_occlusion(tree, child);
371
372                         occ += child->area*child->occlusion;
373                         totarea += child->area;
374                 }
375         }
376
377         if(totarea != 0.0f)
378                 occ /= totarea;
379         
380         node->occlusion= occ;
381 }
382
383 static int occ_find_bbox_axis(OcclusionTree *tree, int begin, int end, float *min, float *max)
384 {
385         float len, maxlen= -1.0f;
386         int a, axis = 0;
387
388         INIT_MINMAX(min, max);
389
390         for(a=begin; a<end; a++)
391                 DO_MINMAX(tree->co[a], min, max)
392
393         for(a=0; a<3; a++) {
394                 len= max[a] - min[a];
395
396                 if(len > maxlen) {
397                         maxlen= len;
398                         axis= a;
399                 }
400         }
401
402         return axis;
403 }
404
405 static void occ_node_from_face(OccFace *face, OccNode *node)
406 {
407         float n[3];
408
409         occ_face(face, node->co, n, &node->area);
410         node->dco= 0.0f;
411         sh_from_disc(n, node->area, node->sh);
412 }
413
414 static void occ_build_dco(OcclusionTree *tree, OccNode *node, float *co, float *dco)
415 {
416         OccNode *child;
417         float dist, d[3], nco[3];
418         int b;
419
420         for(b=0; b<TOTCHILD; b++) {
421                 if(node->childflag & (1<<b)) {
422                         occ_face(tree->face+node->child[b].face, nco, 0, 0);
423                 }
424                 else if(node->child[b].node) {
425                         child= node->child[b].node;
426                         occ_build_dco(tree, child, co, dco);
427                         VECCOPY(nco, child->co);
428                 }
429
430                 VECSUB(d, nco, co);
431                 dist= INPR(d, d);
432                 if(dist > *dco)
433                         *dco= dist;
434         }
435 }
436
437 static void occ_build_split(OcclusionTree *tree, int begin, int end, int *split)
438 {
439         float min[3], max[3], mid;
440         int axis, a, enda;
441
442         /* split in middle of boundbox. this seems faster than median split
443          * on complex scenes, possibly since it avoids two distant faces to
444          * be in the same node better? */
445         axis= occ_find_bbox_axis(tree, begin, end, min, max);
446         mid= 0.5f*(min[axis]+max[axis]);
447
448         a= begin;
449         enda= end;
450         while(a<enda) {
451                 if(tree->co[a][axis] > mid) {
452                         enda--;
453                         SWAP(OccFace, tree->face[a], tree->face[enda]);
454                         SWAP(float, tree->co[a][0], tree->co[enda][0]);
455                         SWAP(float, tree->co[a][1], tree->co[enda][1]);
456                         SWAP(float, tree->co[a][2], tree->co[enda][2]);
457                 }
458                 else
459                         a++;
460         }
461
462         *split= enda;
463 }
464
465 static void occ_build_8_split(OcclusionTree *tree, int begin, int end, int *offset, int *count)
466 {
467         /* split faces into eight groups */
468         int b, splitx, splity[2], splitz[4];
469
470         occ_build_split(tree, begin, end, &splitx);
471
472         /* force split if none found, to deal with degenerate geometry */
473         if(splitx == begin || splitx == end)
474                 splitx= (begin+end)/2;
475
476         occ_build_split(tree, begin, splitx, &splity[0]);
477         occ_build_split(tree, splitx, end, &splity[1]);
478
479         occ_build_split(tree, begin, splity[0], &splitz[0]);
480         occ_build_split(tree, splity[0], splitx, &splitz[1]);
481         occ_build_split(tree, splitx, splity[1], &splitz[2]);
482         occ_build_split(tree, splity[1], end, &splitz[3]);
483
484         offset[0]= begin;
485         offset[1]= splitz[0];
486         offset[2]= splity[0];
487         offset[3]= splitz[1];
488         offset[4]= splitx;
489         offset[5]= splitz[2];
490         offset[6]= splity[1];
491         offset[7]= splitz[3];
492
493         for(b=0; b<7; b++)
494                 count[b]= offset[b+1] - offset[b];
495         count[7]= end - offset[7];
496 }
497
498 static void occ_build_recursive(OcclusionTree *tree, OccNode *node, int begin, int end, int depth);
499
500 static void *exec_occ_build(void *data)
501 {
502         OcclusionBuildThread *othread= (OcclusionBuildThread*)data;
503
504         occ_build_recursive(othread->tree, othread->node, othread->begin, othread->end, othread->depth);
505
506         return 0;
507 }
508
509 static void occ_build_recursive(OcclusionTree *tree, OccNode *node, int begin, int end, int depth)
510 {
511         ListBase threads;
512         OcclusionBuildThread othreads[BLENDER_MAX_THREADS];
513         OccNode *child, tmpnode;
514         OccFace *face;
515         int a, b, totthread=0, offset[TOTCHILD], count[TOTCHILD];
516
517         /* keep track of maximum depth for stack */
518         if(depth > tree->maxdepth)
519                 tree->maxdepth= depth;
520
521         /* add a new node */
522         node->occlusion= 1.0f;
523
524         /* leaf node with only children */
525         if(end - begin <= TOTCHILD) {
526                 for(a=begin, b=0; a<end; a++, b++) {
527                         face= &tree->face[a];
528                         node->child[b].face= a;
529                         node->childflag |= (1<<b);
530                 }
531         }
532         else {
533                 /* order faces */
534                 occ_build_8_split(tree, begin, end, offset, count);
535
536                 if(depth == 1 && tree->dothreadedbuild)
537                         BLI_init_threads(&threads, exec_occ_build, tree->totbuildthread);
538
539                 for(b=0; b<TOTCHILD; b++) {
540                         if(count[b] == 0) {
541                                 node->child[b].node= NULL;
542                         }
543                         else if(count[b] == 1) {
544                                 face= &tree->face[offset[b]];
545                                 node->child[b].face= offset[b];
546                                 node->childflag |= (1<<b);
547                         }
548                         else {
549                                 if(tree->dothreadedbuild)
550                                         BLI_lock_thread(LOCK_CUSTOM1);
551
552                                 child= BLI_memarena_alloc(tree->arena, sizeof(OccNode));
553                                 node->child[b].node= child;
554
555                                 if(tree->dothreadedbuild)
556                                         BLI_unlock_thread(LOCK_CUSTOM1);
557
558                                 if(depth == 1 && tree->dothreadedbuild) {
559                                         othreads[totthread].tree= tree;
560                                         othreads[totthread].node= child;
561                                         othreads[totthread].begin= offset[b];
562                                         othreads[totthread].end= offset[b]+count[b];
563                                         othreads[totthread].depth= depth+1;
564                                         BLI_insert_thread(&threads, &othreads[totthread]);
565                                         totthread++;
566                                 }
567                                 else
568                                         occ_build_recursive(tree, child, offset[b], offset[b]+count[b], depth+1);
569                         }
570                 }
571
572                 if(depth == 1 && tree->dothreadedbuild)
573                         BLI_end_threads(&threads);
574         }
575
576         /* combine area, position and sh */
577         for(b=0; b<TOTCHILD; b++) {
578                 if(node->childflag & (1<<b)) {
579                         child= &tmpnode;
580                         occ_node_from_face(tree->face+node->child[b].face, &tmpnode);
581                 }
582                 else {
583                         child= node->child[b].node;
584                 }
585
586                 if(child) {
587                         node->area += child->area;
588                         sh_add(node->sh, node->sh, child->sh);
589                         VECADDFAC(node->co, node->co, child->co, child->area);
590                 }
591         }
592
593         if(node->area != 0.0f)
594                 VecMulf(node->co, 1.0f/node->area);
595
596         /* compute maximum distance from center */
597         node->dco= 0.0f;
598         occ_build_dco(tree, node, node->co, &node->dco);
599 }
600
601 static void occ_build_sh_normalize(OccNode *node)
602 {
603         /* normalize spherical harmonics to not include area, so
604          * we can clamp the dot product and then mutliply by area */
605         int b;
606
607         if(node->area != 0.0f)
608                 sh_mul(node->sh, 1.0f/node->area);
609
610         for(b=0; b<TOTCHILD; b++) {
611                 if(node->childflag & (1<<b));
612                 else if(node->child[b].node)
613                         occ_build_sh_normalize(node->child[b].node);
614         }
615 }
616
617 static OcclusionTree *occ_tree_build(Render *re)
618 {
619         OcclusionTree *tree;
620         ObjectInstanceRen *obi;
621         ObjectRen *obr;
622         VlakRen *vlr= NULL;
623         int a, b, c, totface;
624
625         /* count */
626         totface= 0;
627         for(obi=re->instancetable.first; obi; obi=obi->next) {
628                 obr= obi->obr;
629                 for(a=0; a<obr->totvlak; a++) {
630                         if((a & 255)==0) vlr= obr->vlaknodes[a>>8].vlak;
631                         else vlr++;
632
633                         if(vlr->mat->mode & MA_TRACEBLE)
634                                 totface++;
635                 }
636         }
637
638         if(totface == 0)
639                 return NULL;
640         
641         tree= MEM_callocN(sizeof(OcclusionTree), "OcclusionTree");
642         tree->totface= totface;
643
644         /* parameters */
645         tree->error= get_render_aosss_error(&re->r, re->wrld.ao_approx_error);
646         tree->distfac= (re->wrld.aomode & WO_AODIST)? re->wrld.aodistfac: 0.0f;
647
648         /* allocation */
649         tree->arena= BLI_memarena_new(0x8000 * sizeof(OccNode));
650         BLI_memarena_use_calloc(tree->arena);
651
652         if(re->wrld.aomode & WO_AOCACHE)
653                 tree->cache= MEM_callocN(sizeof(OcclusionCache)*BLENDER_MAX_THREADS, "OcclusionCache");
654
655         tree->face= MEM_callocN(sizeof(OccFace)*totface, "OcclusionFace");
656         tree->co= MEM_callocN(sizeof(float)*3*totface, "OcclusionCo");
657         tree->occlusion= MEM_callocN(sizeof(float)*totface, "OcclusionOcclusion");
658
659         /* make array of face pointers */
660         for(b=0, c=0, obi=re->instancetable.first; obi; obi=obi->next, c++) {
661                 obr= obi->obr;
662                 for(a=0; a<obr->totvlak; a++) {
663                         if((a & 255)==0) vlr= obr->vlaknodes[a>>8].vlak;
664                         else vlr++;
665
666                         if(vlr->mat->mode & MA_TRACEBLE) {
667                                 tree->face[b].obi= c;
668                                 tree->face[b].facenr= a;
669                                 tree->occlusion[b]= 1.0f;
670                                 occ_face(&tree->face[b], tree->co[b], NULL, NULL); 
671                                 b++;
672                         }
673                 }
674         }
675
676         /* threads */
677         tree->totbuildthread= (re->r.threads > 1 && totface > 10000)? 8: 1;
678         tree->dothreadedbuild= (tree->totbuildthread > 1);
679
680         /* recurse */
681         tree->root= BLI_memarena_alloc(tree->arena, sizeof(OccNode));
682         occ_build_recursive(tree, tree->root, 0, totface, 1);
683
684 #if 0
685         if(tree->doindirect) {
686                 occ_build_shade(re, tree);
687                 occ_sum_occlusion(tree, tree->root);
688         }
689 #endif
690         
691         MEM_freeN(tree->co);
692         tree->co= NULL;
693
694         occ_build_sh_normalize(tree->root);
695
696         for(a=0; a<BLENDER_MAX_THREADS; a++)
697                 tree->stack[a]= MEM_callocN(sizeof(OccNode)*TOTCHILD*(tree->maxdepth+1), "OccStack");
698
699         return tree;
700 }
701
702 static void occ_free_tree(OcclusionTree *tree)
703 {
704         int a;
705
706         if(tree) {
707                 if(tree->arena) BLI_memarena_free(tree->arena);
708                 for(a=0; a<BLENDER_MAX_THREADS; a++)
709                         if(tree->stack[a])
710                                 MEM_freeN(tree->stack[a]);
711                 if(tree->occlusion) MEM_freeN(tree->occlusion);
712                 if(tree->face) MEM_freeN(tree->face);
713                 if(tree->cache) MEM_freeN(tree->cache);
714                 MEM_freeN(tree);
715         }
716 }
717
718 /* ------------------------- Traversal --------------------------- */
719
720 static float occ_solid_angle(OccNode *node, float *v, float d2, float invd2, float *receivenormal)
721 {
722         float dotreceive, dotemit;
723         float ev[3];
724
725         ev[0]= -v[0]*invd2;
726         ev[1]= -v[1]*invd2;
727         ev[2]= -v[2]*invd2;
728         dotemit= sh_eval(node->sh, ev);
729         dotreceive= INPR(receivenormal, v)*invd2;
730
731         CLAMP(dotemit, 0.0f, 1.0f);
732         CLAMP(dotreceive, 0.0f, 1.0f);
733         
734         return ((node->area*dotemit*dotreceive)/(d2 + node->area*INVPI))*INVPI;
735 }
736
737 static void VecAddDir(float *result, float *v1, float *v2, float fac)
738 {
739         result[0]= v1[0] + fac*(v2[0] - v1[0]);
740         result[1]= v1[1] + fac*(v2[1] - v1[1]);
741         result[2]= v1[2] + fac*(v2[2] - v1[2]);
742 }
743
744 static int occ_visible_quad(float *p, float *n, float *v0, float *v1, float *v2, float *q0, float *q1, float *q2, float *q3)
745 {
746         static const float epsilon = 1e-6f;
747         float c, sd[3];
748         
749         c= INPR(n, p);
750
751         /* signed distances from the vertices to the plane. */
752         sd[0]= INPR(n, v0) - c;
753         sd[1]= INPR(n, v1) - c;
754         sd[2]= INPR(n, v2) - c;
755
756         if(fabs(sd[0]) < epsilon) sd[0] = 0.0f;
757         if(fabs(sd[1]) < epsilon) sd[1] = 0.0f;
758         if(fabs(sd[2]) < epsilon) sd[2] = 0.0f;
759
760         if(sd[0] > 0) {
761                 if(sd[1] > 0) {
762                         if(sd[2] > 0) {
763                                 // +++
764                                 VECCOPY(q0, v0);
765                                 VECCOPY(q1, v1);
766                                 VECCOPY(q2, v2);
767                                 VECCOPY(q3, q2);
768                         }
769                         else if(sd[2] < 0) {
770                                 // ++-
771                                 VECCOPY(q0, v0);
772                                 VECCOPY(q1, v1);
773                                 VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
774                                 VecAddDir(q3, v0, v2, (sd[0]/(sd[0]-sd[2])));
775                         }
776                         else {
777                                 // ++0
778                                 VECCOPY(q0, v0);
779                                 VECCOPY(q1, v1);
780                                 VECCOPY(q2, v2);
781                                 VECCOPY(q3, q2);
782                         }
783                 }
784                 else if(sd[1] < 0) {
785                         if(sd[2] > 0) {
786                                 // +-+
787                                 VECCOPY(q0, v0);
788                                 VecAddDir(q1, v0, v1, (sd[0]/(sd[0]-sd[1])));
789                                 VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
790                                 VECCOPY(q3, v2);
791                         }
792                         else if(sd[2] < 0) {
793                                 // +--
794                                 VECCOPY(q0, v0);
795                                 VecAddDir(q1, v0, v1, (sd[0]/(sd[0]-sd[1])));
796                                 VecAddDir(q2, v0, v2, (sd[0]/(sd[0]-sd[2])));
797                                 VECCOPY(q3, q2);
798                         }
799                         else {
800                                 // +-0
801                                 VECCOPY(q0, v0);
802                                 VecAddDir(q1, v0, v1, (sd[0]/(sd[0]-sd[1])));
803                                 VECCOPY(q2, v2);
804                                 VECCOPY(q3, q2);
805                         }
806                 }
807                 else {
808                         if(sd[2] > 0) {
809                                 // +0+
810                                 VECCOPY(q0, v0);
811                                 VECCOPY(q1, v1);
812                                 VECCOPY(q2, v2);
813                                 VECCOPY(q3, q2);
814                         }
815                         else if(sd[2] < 0) {
816                                 // +0-
817                                 VECCOPY(q0, v0);
818                                 VECCOPY(q1, v1);
819                                 VecAddDir(q2, v0, v2, (sd[0]/(sd[0]-sd[2])));
820                                 VECCOPY(q3, q2);
821                         }
822                         else {
823                                 // +00
824                                 VECCOPY(q0, v0);
825                                 VECCOPY(q1, v1);
826                                 VECCOPY(q2, v2);
827                                 VECCOPY(q3, q2);
828                         }
829                 }
830         }
831         else if(sd[0] < 0) {
832                 if(sd[1] > 0) {
833                         if(sd[2] > 0) {
834                                 // -++
835                                 VecAddDir(q0, v0, v1, (sd[0]/(sd[0]-sd[1])));
836                                 VECCOPY(q1, v1);
837                                 VECCOPY(q2, v2);
838                                 VecAddDir(q3, v0, v2, (sd[0]/(sd[0]-sd[2])));
839                         }
840                         else if(sd[2] < 0) {
841                                 // -+-
842                                 VecAddDir(q0, v0, v1, (sd[0]/(sd[0]-sd[1])));
843                                 VECCOPY(q1, v1);
844                                 VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
845                                 VECCOPY(q3, q2);
846                         }
847                         else {
848                                 // -+0
849                                 VecAddDir(q0, v0, v1, (sd[0]/(sd[0]-sd[1])));
850                                 VECCOPY(q1, v1);
851                                 VECCOPY(q2, v2);
852                                 VECCOPY(q3, q2);
853                         }
854                 }
855                 else if(sd[1] < 0) {
856                         if(sd[2] > 0) {
857                                 // --+
858                                 VecAddDir(q0, v0, v2, (sd[0]/(sd[0]-sd[2])));
859                                 VecAddDir(q1, v1, v2, (sd[1]/(sd[1]-sd[2])));
860                                 VECCOPY(q2, v2);
861                                 VECCOPY(q3, q2);
862                         }
863                         else if(sd[2] < 0) {
864                                 // ---
865                                 return 0;
866                         }
867                         else {
868                                 // --0
869                                 return 0;
870                         }
871                 }
872                 else {
873                         if(sd[2] > 0) {
874                                 // -0+
875                                 VecAddDir(q0, v0, v2, (sd[0]/(sd[0]-sd[2])));
876                                 VECCOPY(q1, v1);
877                                 VECCOPY(q2, v2);
878                                 VECCOPY(q3, q2);
879                         }
880                         else if(sd[2] < 0) {
881                                 // -0-
882                                 return 0;
883                         }
884                         else {
885                                 // -00
886                                 return 0;
887                         }
888                 }
889         }
890         else {
891                 if(sd[1] > 0) {
892                         if(sd[2] > 0) {
893                                 // 0++
894                                 VECCOPY(q0, v0);
895                                 VECCOPY(q1, v1);
896                                 VECCOPY(q2, v2);
897                                 VECCOPY(q3, q2);
898                         }
899                         else if(sd[2] < 0) {
900                                 // 0+-
901                                 VECCOPY(q0, v0);
902                                 VECCOPY(q1, v1);
903                                 VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
904                                 VECCOPY(q3, q2);
905                         }
906                         else {
907                                 // 0+0
908                                 VECCOPY(q0, v0);
909                                 VECCOPY(q1, v1);
910                                 VECCOPY(q2, v2);
911                                 VECCOPY(q3, q2);
912                         }
913                 }
914                 else if(sd[1] < 0) {
915                         if(sd[2] > 0) {
916                                 // 0-+
917                                 VECCOPY(q0, v0);
918                                 VecAddDir(q1, v1, v2, (sd[1]/(sd[1]-sd[2])));
919                                 VECCOPY(q2, v2);
920                                 VECCOPY(q3, q2);
921                         }
922                         else if(sd[2] < 0) {
923                                 // 0--
924                                 return 0;
925                         }
926                         else {
927                                 // 0-0
928                                 return 0;
929                         }
930                 }
931                 else {
932                         if(sd[2] > 0) {
933                                 // 00+
934                                 VECCOPY(q0, v0);
935                                 VECCOPY(q1, v1);
936                                 VECCOPY(q2, v2);
937                                 VECCOPY(q3, q2);
938                         }
939                         else if(sd[2] < 0) {
940                                 // 00-
941                                 return 0;
942                         }
943                         else {
944                                 // 000
945                                 return 0;
946                         }
947                 }
948         }
949
950         return 1;
951 }
952
953 /* altivec optimization, this works, but is unused */
954
955 #if 0
956 #include <Accelerate/Accelerate.h>
957
958 typedef union {
959         vFloat v;
960         float f[4];
961 } vFloatResult;
962
963 static vFloat vec_splat_float(float val)
964 {
965         return (vFloat){val, val, val, val};
966 }
967
968 static float occ_quad_form_factor(float *p, float *n, float *q0, float *q1, float *q2, float *q3)
969 {
970         vFloat vcos, rlen, vrx, vry, vrz, vsrx, vsry, vsrz, gx, gy, gz, vangle;
971         vUInt8 rotate = (vUInt8){4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3};
972         vFloatResult vresult;
973         float result;
974
975         /* compute r* */
976         vrx = (vFloat){q0[0], q1[0], q2[0], q3[0]} - vec_splat_float(p[0]);
977         vry = (vFloat){q0[1], q1[1], q2[1], q3[1]} - vec_splat_float(p[1]);
978         vrz = (vFloat){q0[2], q1[2], q2[2], q3[2]} - vec_splat_float(p[2]);
979
980         /* normalize r* */
981         rlen = vec_rsqrte(vrx*vrx + vry*vry + vrz*vrz + vec_splat_float(1e-16f));
982         vrx = vrx*rlen;
983         vry = vry*rlen;
984         vrz = vrz*rlen;
985
986         /* rotate r* for cross and dot */
987         vsrx= vec_perm(vrx, vrx, rotate);
988         vsry= vec_perm(vry, vry, rotate);
989         vsrz= vec_perm(vrz, vrz, rotate);
990
991         /* cross product */
992         gx = vsry*vrz - vsrz*vry;
993         gy = vsrz*vrx - vsrx*vrz;
994         gz = vsrx*vry - vsry*vrx;
995
996         /* normalize */
997         rlen = vec_rsqrte(gx*gx + gy*gy + gz*gz + vec_splat_float(1e-16f));
998         gx = gx*rlen;
999         gy = gy*rlen;
1000         gz = gz*rlen;
1001
1002         /* angle */
1003         vcos = vrx*vsrx + vry*vsry + vrz*vsrz;
1004         vcos= vec_max(vec_min(vcos, vec_splat_float(1.0f)), vec_splat_float(-1.0f));
1005         vangle= vacosf(vcos);
1006
1007         /* dot */
1008         vresult.v = (vec_splat_float(n[0])*gx +
1009                      vec_splat_float(n[1])*gy +
1010                      vec_splat_float(n[2])*gz)*vangle;
1011
1012         result= (vresult.f[0] + vresult.f[1] + vresult.f[2] + vresult.f[3])*(0.5f/(float)M_PI);
1013         result= MAX2(result, 0.0f);
1014
1015         return result;
1016 }
1017
1018 #endif
1019
1020 /* SSE optimization, acos code doesn't work */
1021
1022 #if 0
1023
1024 #include <xmmintrin.h>
1025
1026 static __m128 sse_approx_acos(__m128 x)
1027 {
1028         /* needs a better approximation than taylor expansion of acos, since that
1029          * gives big erros for near 1.0 values, sqrt(2*x)*acos(1-x) should work
1030          * better, see http://www.tom.womack.net/projects/sse-fast-arctrig.html */
1031
1032         return _mm_set_ps1(1.0f);
1033 }
1034
1035 static float occ_quad_form_factor(float *p, float *n, float *q0, float *q1, float *q2, float *q3)
1036 {
1037         float r0[3], r1[3], r2[3], r3[3], g0[3], g1[3], g2[3], g3[3];
1038         float a1, a2, a3, a4, dot1, dot2, dot3, dot4, result;
1039         float fresult[4] __attribute__((aligned(16)));
1040         __m128 qx, qy, qz, rx, ry, rz, rlen, srx, sry, srz, gx, gy, gz, glen, rcos, angle, aresult;
1041
1042         /* compute r */
1043         qx = _mm_set_ps(q3[0], q2[0], q1[0], q0[0]);
1044         qy = _mm_set_ps(q3[1], q2[1], q1[1], q0[1]);
1045         qz = _mm_set_ps(q3[2], q2[2], q1[2], q0[2]);
1046
1047         rx = qx - _mm_set_ps1(p[0]);
1048         ry = qy - _mm_set_ps1(p[1]);
1049         rz = qz - _mm_set_ps1(p[2]);
1050
1051         /* normalize r */
1052         rlen = _mm_rsqrt_ps(rx*rx + ry*ry + rz*rz + _mm_set_ps1(1e-16f));
1053         rx = rx*rlen;
1054         ry = ry*rlen;
1055         rz = rz*rlen;
1056
1057         /* cross product */
1058         srx = _mm_shuffle_ps(rx, rx, _MM_SHUFFLE(0,3,2,1));
1059         sry = _mm_shuffle_ps(ry, ry, _MM_SHUFFLE(0,3,2,1));
1060         srz = _mm_shuffle_ps(rz, rz, _MM_SHUFFLE(0,3,2,1));
1061
1062         gx = sry*rz - srz*ry;
1063         gy = srz*rx - srx*rz;
1064         gz = srx*ry - sry*rx;
1065
1066         /* normalize g */
1067         glen = _mm_rsqrt_ps(gx*gx + gy*gy + gz*gz + _mm_set_ps1(1e-16f));
1068         gx = gx*glen;
1069         gy = gy*glen;
1070         gz = gz*glen;
1071
1072         /* compute angle */
1073         rcos = rx*srx + ry*sry + rz*srz;
1074         rcos= _mm_max_ps(_mm_min_ps(rcos, _mm_set_ps1(1.0f)), _mm_set_ps1(-1.0f));
1075
1076         angle = sse_approx_cos(rcos);
1077         aresult = (_mm_set_ps1(n[0])*gx + _mm_set_ps1(n[1])*gy + _mm_set_ps1(n[2])*gz)*angle;
1078
1079         /* sum together */
1080     result= (fresult[0] + fresult[1] + fresult[2] + fresult[3])*(0.5f/(float)M_PI);
1081         result= MAX2(result, 0.0f);
1082
1083         return result;
1084 }
1085
1086 #endif
1087
1088 static float saacosf(float fac)
1089 {
1090         if(fac<= -1.0f) return (float)M_PI;
1091         else if(fac>=1.0f) return 0.0f;
1092         else return acos(fac); /* acosf(fac) */
1093 }
1094
1095 static void normalizef(float *n)
1096 {
1097         float d;
1098         
1099         d= INPR(n, n);
1100
1101         if(d > 1.0e-35F) {
1102                 d= 1.0f/sqrt(d); /* sqrtf(d) */
1103
1104                 n[0] *= d; 
1105                 n[1] *= d; 
1106                 n[2] *= d;
1107         } 
1108 }
1109
1110 static float occ_quad_form_factor(float *p, float *n, float *q0, float *q1, float *q2, float *q3)
1111 {
1112         float r0[3], r1[3], r2[3], r3[3], g0[3], g1[3], g2[3], g3[3];
1113         float a1, a2, a3, a4, dot1, dot2, dot3, dot4, result;
1114
1115         VECSUB(r0, q0, p);
1116         VECSUB(r1, q1, p);
1117         VECSUB(r2, q2, p);
1118         VECSUB(r3, q3, p);
1119
1120         normalizef(r0);
1121         normalizef(r1);
1122         normalizef(r2);
1123         normalizef(r3);
1124
1125         Crossf(g0, r1, r0); normalizef(g0);
1126         Crossf(g1, r2, r1); normalizef(g1);
1127         Crossf(g2, r3, r2); normalizef(g2);
1128         Crossf(g3, r0, r3); normalizef(g3);
1129
1130         a1= saacosf(INPR(r0, r1));
1131         a2= saacosf(INPR(r1, r2));
1132         a3= saacosf(INPR(r2, r3));
1133         a4= saacosf(INPR(r3, r0));
1134
1135         dot1= INPR(n, g0);
1136         dot2= INPR(n, g1);
1137         dot3= INPR(n, g2);
1138         dot4= INPR(n, g3);
1139
1140         result= (a1*dot1 + a2*dot2 + a3*dot3 + a4*dot4)*0.5f/(float)M_PI;
1141         result= MAX2(result, 0.0f);
1142
1143         return result;
1144 }
1145
1146 static float occ_form_factor(OccFace *face, float *p, float *n)
1147 {
1148         ObjectInstanceRen *obi;
1149         VlakRen *vlr;
1150         float v1[3], v2[3], v3[3], v4[3], q0[3], q1[3], q2[3], q3[3], contrib= 0.0f;
1151
1152         obi= &R.objectinstance[face->obi];
1153         vlr= RE_findOrAddVlak(obi->obr, face->facenr);
1154
1155         VECCOPY(v1, vlr->v1->co);
1156         VECCOPY(v2, vlr->v2->co);
1157         VECCOPY(v3, vlr->v3->co);
1158
1159         if(obi->flag & R_TRANSFORMED) {
1160                 Mat4MulVecfl(obi->mat, v1);
1161                 Mat4MulVecfl(obi->mat, v2);
1162                 Mat4MulVecfl(obi->mat, v3);
1163         }
1164
1165         if(occ_visible_quad(p, n, v1, v2, v3, q0, q1, q2, q3))
1166                 contrib += occ_quad_form_factor(p, n, q0, q1, q2, q3);
1167
1168         if(vlr->v4) {
1169                 VECCOPY(v4, vlr->v4->co);
1170                 if(obi->flag & R_TRANSFORMED)
1171                         Mat4MulVecfl(obi->mat, v4);
1172
1173                 if(occ_visible_quad(p, n, v1, v3, v4, q0, q1, q2, q3))
1174                         contrib += occ_quad_form_factor(p, n, q0, q1, q2, q3);
1175         }
1176
1177         return contrib;
1178 }
1179
1180 static void occ_lookup(OcclusionTree *tree, int thread, OccFace *exclude, float *pp, float *pn, float *occ, float *bentn)
1181 {
1182         OccNode *node, **stack;
1183         OccFace *face;
1184         float resultocc, v[3], p[3], n[3], co[3], invd2;
1185         float distfac, fac, error, d2, weight, emitarea;
1186         int b, totstack;
1187
1188         /* init variables */
1189         VECCOPY(p, pp);
1190         VECCOPY(n, pn);
1191         VECADDFAC(p, p, n, 1e-4f);
1192
1193         if(bentn)
1194                 VECCOPY(bentn, n);
1195         
1196         error= tree->error;
1197         distfac= tree->distfac;
1198
1199         resultocc= 0.0f;
1200
1201         /* init stack */
1202         stack= tree->stack[thread];
1203         stack[0]= tree->root;
1204         totstack= 1;
1205
1206         while(totstack) {
1207                 /* pop point off the stack */
1208                 node= stack[--totstack];
1209
1210                 VECSUB(v, node->co, p);
1211                 d2= INPR(v, v) + 1e-16f;
1212                 emitarea= MAX2(node->area, node->dco);
1213
1214                 if(d2*error > emitarea) {
1215                         if(distfac != 0.0f) {
1216                                 fac= 1.0f/(1.0f + distfac*d2);
1217                                 if(fac < 0.01f)
1218                                         continue;
1219                         }
1220                         else
1221                                 fac= 1.0f;
1222
1223                         /* accumulate occlusion from spherical harmonics */
1224                         invd2 = 1.0f/sqrt(d2);
1225                         weight= occ_solid_angle(node, v, d2, invd2, n);
1226                         weight *= node->occlusion;
1227
1228                         if(bentn) {
1229                                 bentn[0] -= weight*invd2*v[0];
1230                                 bentn[1] -= weight*invd2*v[1];
1231                                 bentn[2] -= weight*invd2*v[2];
1232                         }
1233
1234                         resultocc += weight*fac;
1235                 }
1236                 else {
1237                         /* traverse into children */
1238                         for(b=0; b<TOTCHILD; b++) {
1239                                 if(node->childflag & (1<<b)) {
1240                                         face= tree->face+node->child[b].face;
1241
1242                                         /* accumulate occlusion with face form factor */
1243                                         if(!exclude || !(face->obi == exclude->obi && face->facenr == exclude->facenr)) {
1244                                                 if(bentn || distfac != 0.0f) {
1245                                                         occ_face(face, co, NULL, NULL); 
1246                                                         VECSUB(v, co, p);
1247                                                         d2= INPR(v, v) + 1e-16f;
1248
1249                                                         fac= (distfac == 0.0f)? 1.0f: 1.0f/(1.0f + distfac*d2);
1250                                                         if(fac < 0.01f)
1251                                                                 continue;
1252                                                 }
1253                                                 else
1254                                                         fac= 1.0f;
1255
1256                                                 weight= occ_form_factor(face, p, n);
1257                                                 weight *= tree->occlusion[node->child[b].face];
1258
1259                                                 if(bentn) {
1260                                                         invd2= 1.0f/sqrt(d2);
1261                                                         bentn[0] -= weight*invd2*v[0];
1262                                                         bentn[1] -= weight*invd2*v[1];
1263                                                         bentn[2] -= weight*invd2*v[2];
1264                                                 }
1265
1266                                                 resultocc += weight*fac;
1267                                         }
1268                                 }
1269                                 else if(node->child[b].node) {
1270                                         /* push child on the stack */
1271                                         stack[totstack++]= node->child[b].node;
1272                                 }
1273                         }
1274                 }
1275         }
1276
1277         if(occ) *occ= resultocc;
1278         if(bentn) Normalize(bentn);
1279 }
1280
1281 static void occ_compute_passes(Render *re, OcclusionTree *tree, int totpass)
1282 {
1283         float *occ, co[3], n[3];
1284         int pass, i;
1285         
1286         occ= MEM_callocN(sizeof(float)*tree->totface, "OcclusionPassOcc");
1287
1288         for(pass=0; pass<totpass; pass++) {
1289                 for(i=0; i<tree->totface; i++) {
1290                         occ_face(&tree->face[i], co, n, NULL);
1291                         VecNegf(n);
1292                         VECADDFAC(co, co, n, 1e-8f);
1293
1294                         occ_lookup(tree, 0, &tree->face[i], co, n, &occ[i], NULL);
1295                         if(re->test_break())
1296                                 break;
1297                 }
1298
1299                 if(re->test_break())
1300                         break;
1301
1302                 for(i=0; i<tree->totface; i++) {
1303                         tree->occlusion[i] -= occ[i]; //MAX2(1.0f-occ[i], 0.0f);
1304                         if(tree->occlusion[i] < 0.0f)
1305                                 tree->occlusion[i]= 0.0f;
1306                 }
1307
1308                 occ_sum_occlusion(tree, tree->root);
1309         }
1310
1311         MEM_freeN(occ);
1312 }
1313
1314 static void sample_occ_tree(Render *re, OcclusionTree *tree, OccFace *exclude, float *co, float *n, int thread, int onlyshadow, float *skycol)
1315 {
1316         float nn[3], bn[3], fac, occ, occlusion, correction;
1317         int aocolor;
1318
1319         aocolor= re->wrld.aocolor;
1320         if(onlyshadow)
1321                 aocolor= WO_AOPLAIN;
1322
1323         VECCOPY(nn, n);
1324         VecNegf(nn);
1325
1326         occ_lookup(tree, thread, exclude, co, nn, &occ, (aocolor)? bn: NULL);
1327
1328         correction= re->wrld.ao_approx_correction;
1329
1330         occlusion= (1.0f-correction)*(1.0f-occ);
1331         CLAMP(occlusion, 0.0f, 1.0f);
1332         if(correction != 0.0f)
1333                 occlusion += correction*exp(-occ);
1334
1335         if(aocolor) {
1336                 /* sky shading using bent normal */
1337                 if(ELEM(aocolor, WO_AOSKYCOL, WO_AOSKYTEX)) {
1338                         fac= 0.5*(1.0f+bn[0]*re->grvec[0]+ bn[1]*re->grvec[1]+ bn[2]*re->grvec[2]);
1339                         skycol[0]= (1.0f-fac)*re->wrld.horr + fac*re->wrld.zenr;
1340                         skycol[1]= (1.0f-fac)*re->wrld.horg + fac*re->wrld.zeng;
1341                         skycol[2]= (1.0f-fac)*re->wrld.horb + fac*re->wrld.zenb;
1342                 }
1343 #if 0
1344                 else {  /* WO_AOSKYTEX */
1345                         float dxyview[3];
1346                         bn[0]= -bn[0];
1347                         bn[1]= -bn[1];
1348                         bn[2]= -bn[2];
1349                         dxyview[0]= 1.0f;
1350                         dxyview[1]= 1.0f;
1351                         dxyview[2]= 0.0f;
1352                         shadeSkyView(skycol, co, bn, dxyview);
1353                 }
1354 #endif
1355
1356                 VecMulf(skycol, occlusion);
1357         }
1358         else {
1359                 skycol[0]= occlusion;
1360                 skycol[1]= occlusion;
1361                 skycol[2]= occlusion;
1362         }
1363 }
1364
1365 /* ---------------------------- Caching ------------------------------- */
1366
1367 static OcclusionCacheSample *find_occ_sample(OcclusionCache *cache, int x, int y)
1368 {
1369         x -= cache->x;
1370         y -= cache->y;
1371
1372         x /= cache->step;
1373         y /= cache->step;
1374         x *= cache->step;
1375         y *= cache->step;
1376
1377         if(x < 0 || x >= cache->w || y < 0 || y >= cache->h)
1378                 return NULL;
1379         else
1380                 return &cache->sample[y*cache->w + x];
1381 }
1382
1383 static int sample_occ_cache(OcclusionTree *tree, float *co, float *n, int x, int y, int thread, float *col)
1384 {
1385         OcclusionCache *cache;
1386         OcclusionCacheSample *samples[4], *sample;
1387         float wn[4], wz[4], wb[4], tx, ty, w, totw, mino, maxo;
1388         float d[3], dist2;
1389         int i, x1, y1, x2, y2;
1390
1391         if(!tree->cache)
1392                 return 0;
1393         
1394         /* first try to find a sample in the same pixel */
1395         cache= &tree->cache[thread];
1396
1397         if(cache->sample && cache->step) {
1398                 sample= &cache->sample[(y-cache->y)*cache->w + (x-cache->x)];
1399                 if(sample->filled) {
1400                         VECSUB(d, sample->co, co);
1401                         dist2= INPR(d, d);
1402                         if(dist2 < 0.5f*sample->dist2 && INPR(sample->n, n) > 0.98f) {
1403                                 VECCOPY(col, sample->col);
1404                                 return 1;
1405                         }
1406                 }
1407         }
1408         else
1409                 return 0;
1410
1411         /* try to interpolate between 4 neighbouring pixels */
1412         samples[0]= find_occ_sample(cache, x, y);
1413         samples[1]= find_occ_sample(cache, x+cache->step, y);
1414         samples[2]= find_occ_sample(cache, x, y+cache->step);
1415         samples[3]= find_occ_sample(cache, x+cache->step, y+cache->step);
1416
1417         for(i=0; i<4; i++)
1418                 if(!samples[i] || !samples[i]->filled)
1419                         return 0;
1420
1421         /* require intensities not being too different */
1422         mino= MIN4(samples[0]->intensity, samples[1]->intensity, samples[2]->intensity, samples[3]->intensity);
1423         maxo= MAX4(samples[0]->intensity, samples[1]->intensity, samples[2]->intensity, samples[3]->intensity);
1424
1425         if(maxo - mino > 0.05f)
1426                 return 0;
1427
1428         /* compute weighted interpolation between samples */
1429         col[0]= col[1]= col[2]= 0.0f;
1430         totw= 0.0f;
1431
1432         x1= samples[0]->x;
1433         y1= samples[0]->y;
1434         x2= samples[3]->x;
1435         y2= samples[3]->y;
1436
1437         tx= (float)(x2 - x)/(float)(x2 - x1);
1438         ty= (float)(y2 - y)/(float)(y2 - y1);
1439
1440         wb[3]= (1.0f-tx)*(1.0f-ty);
1441         wb[2]= (tx)*(1.0f-ty);
1442         wb[1]= (1.0f-tx)*(ty);
1443         wb[0]= tx*ty;
1444
1445         for(i=0; i<4; i++) {
1446                 VECSUB(d, samples[i]->co, co);
1447                 dist2= INPR(d, d);
1448
1449                 wz[i]= 1.0f; //(samples[i]->dist2/(1e-4f + dist2));
1450                 wn[i]= pow(INPR(samples[i]->n, n), 32.0f);
1451
1452                 w= wb[i]*wn[i]*wz[i];
1453
1454                 totw += w;
1455                 col[0] += w*samples[i]->col[0];
1456                 col[1] += w*samples[i]->col[1];
1457                 col[2] += w*samples[i]->col[2];
1458         }
1459
1460         if(totw >= 0.9f) {
1461                 totw= 1.0f/totw;
1462                 col[0] *= totw;
1463                 col[1] *= totw;
1464                 col[2] *= totw;
1465                 return 1;
1466         }
1467
1468         return 0;
1469 }
1470
1471 static void sample_occ_surface(ShadeInput *shi)
1472 {
1473         StrandRen *strand= shi->strand;
1474         StrandSurface *mesh= strand->buffer->surface;
1475         int *face, *index = RE_strandren_get_face(shi->obr, strand, 0);
1476         float w[4], *co1, *co2, *co3, *co4;
1477
1478         if(mesh && mesh->face && mesh->co && mesh->col && index) {
1479                 face= mesh->face[*index];
1480
1481                 co1= mesh->co[face[0]];
1482                 co2= mesh->co[face[1]];
1483                 co3= mesh->co[face[2]];
1484                 co4= (face[3])? mesh->co[face[3]]: NULL;
1485
1486                 InterpWeightsQ3Dfl(co1, co2, co3, co4, strand->vert->co, w);
1487
1488                 shi->ao[0]= shi->ao[1]= shi->ao[2]= 0.0f;
1489                 VECADDFAC(shi->ao, shi->ao, mesh->col[face[0]], w[0]);
1490                 VECADDFAC(shi->ao, shi->ao, mesh->col[face[1]], w[1]);
1491                 VECADDFAC(shi->ao, shi->ao, mesh->col[face[2]], w[2]);
1492                 if(face[3])
1493                         VECADDFAC(shi->ao, shi->ao, mesh->col[face[3]], w[3]);
1494         }
1495         else {
1496                 shi->ao[0]= 1.0f;
1497                 shi->ao[1]= 1.0f;
1498                 shi->ao[2]= 1.0f;
1499         }
1500 }
1501
1502 /* ------------------------- External Functions --------------------------- */
1503
1504 static void *exec_strandsurface_sample(void *data)
1505 {
1506         OcclusionThread *othread= (OcclusionThread*)data;
1507         Render *re= othread->re;
1508         StrandSurface *mesh= othread->mesh;
1509         float col[3], co[3], n[3], *co1, *co2, *co3, *co4;
1510         int a, *face;
1511
1512         for(a=othread->begin; a<othread->end; a++) {
1513                 face= mesh->face[a];
1514                 co1= mesh->co[face[0]];
1515                 co2= mesh->co[face[1]];
1516                 co3= mesh->co[face[2]];
1517
1518                 if(face[3]) {
1519                         co4= mesh->co[face[3]];
1520
1521                         VecLerpf(co, co1, co3, 0.5f);
1522                         CalcNormFloat4(co1, co2, co3, co4, n);
1523                 }
1524                 else {
1525                         CalcCent3f(co, co1, co2, co3);
1526                         CalcNormFloat(co1, co2, co3, n);
1527                 }
1528                 VecNegf(n);
1529
1530                 sample_occ_tree(re, re->occlusiontree, NULL, co, n, othread->thread, 0, col);
1531                 VECCOPY(othread->facecol[a], col);
1532         }
1533
1534         return 0;
1535 }
1536
1537 void make_occ_tree(Render *re)
1538 {
1539         OcclusionThread othreads[BLENDER_MAX_THREADS];
1540         StrandSurface *mesh;
1541         ListBase threads;
1542         float col[3], (*facecol)[3];
1543         int a, totface, totthread, *face, *count;
1544
1545         /* ugly, needed for occ_face */
1546         R= *re;
1547
1548         re->i.infostr= "Occlusion preprocessing";
1549         re->stats_draw(&re->i);
1550         
1551         re->occlusiontree= occ_tree_build(re);
1552         
1553         if(re->occlusiontree) {
1554                 if(re->wrld.ao_approx_passes)
1555                         occ_compute_passes(re, re->occlusiontree, re->wrld.ao_approx_passes);
1556
1557                 for(mesh=re->strandsurface.first; mesh; mesh=mesh->next) {
1558                         if(!mesh->face || !mesh->co || !mesh->col)
1559                                 continue;
1560
1561                         count= MEM_callocN(sizeof(int)*mesh->totvert, "OcclusionCount");
1562                         facecol= MEM_callocN(sizeof(float)*3*mesh->totface, "StrandSurfFaceCol");
1563
1564                         totthread= (mesh->totface > 10000)? re->r.threads: 1;
1565                         totface= mesh->totface/totthread;
1566                         for(a=0; a<totthread; a++) {
1567                                 othreads[a].re= re;
1568                                 othreads[a].facecol= facecol;
1569                                 othreads[a].thread= a;
1570                                 othreads[a].mesh= mesh;
1571                                 othreads[a].begin= a*totface;
1572                                 othreads[a].end= (a == totthread-1)? mesh->totface: (a+1)*totface;
1573                         }
1574
1575                         if(totthread == 1) {
1576                                 exec_strandsurface_sample(&othreads[0]);
1577                         }
1578                         else {
1579                                 BLI_init_threads(&threads, exec_strandsurface_sample, totthread);
1580
1581                                 for(a=0; a<totthread; a++)
1582                                         BLI_insert_thread(&threads, &othreads[a]);
1583
1584                                 BLI_end_threads(&threads);
1585                         }
1586
1587                         for(a=0; a<mesh->totface; a++) {
1588                                 face= mesh->face[a];
1589
1590                                 VECCOPY(col, facecol[a]);
1591                                 VECADD(mesh->col[face[0]], mesh->col[face[0]], col);
1592                                 count[face[0]]++;
1593                                 VECADD(mesh->col[face[1]], mesh->col[face[1]], col);
1594                                 count[face[1]]++;
1595                                 VECADD(mesh->col[face[2]], mesh->col[face[2]], col);
1596                                 count[face[2]]++;
1597
1598                                 if(face[3]) {
1599                                         VECADD(mesh->col[face[3]], mesh->col[face[3]], col);
1600                                         count[face[3]]++;
1601                                 }
1602                         }
1603
1604                         for(a=0; a<mesh->totvert; a++)
1605                                 if(count[a])
1606                                         VecMulf(mesh->col[a], 1.0f/count[a]);
1607
1608                         MEM_freeN(count);
1609                         MEM_freeN(facecol);
1610                 }
1611         }
1612 }
1613
1614 void free_occ(Render *re)
1615 {
1616         if(re->occlusiontree) {
1617                 occ_free_tree(re->occlusiontree);
1618                 re->occlusiontree = NULL;
1619         }
1620 }
1621
1622 void sample_occ(Render *re, ShadeInput *shi)
1623 {
1624         OcclusionTree *tree= re->occlusiontree;
1625         OcclusionCache *cache;
1626         OcclusionCacheSample *sample;
1627         OccFace exclude;
1628         int onlyshadow;
1629
1630         if(tree) {
1631                 if(shi->strand) {
1632                         sample_occ_surface(shi);
1633                 }
1634                 /* try to get result from the cache if possible */
1635                 else if(shi->depth!=0 || !sample_occ_cache(tree, shi->co, shi->vno, shi->xs, shi->ys, shi->thread, shi->ao)) {
1636                         /* no luck, let's sample the occlusion */
1637                         exclude.obi= shi->obi - re->objectinstance;
1638                         exclude.facenr= shi->vlr->index;
1639                         onlyshadow= (shi->mat->mode & MA_ONLYSHADOW);
1640                         sample_occ_tree(re, tree, &exclude, shi->co, shi->vno, shi->thread, onlyshadow, shi->ao);
1641
1642                         /* fill result into sample, each time */
1643                         if(tree->cache) {
1644                                 cache= &tree->cache[shi->thread];
1645
1646                                 if(cache->sample && cache->step) {
1647                                         sample= &cache->sample[(shi->ys-cache->y)*cache->w + (shi->xs-cache->x)];
1648                                         VECCOPY(sample->co, shi->co);
1649                                         VECCOPY(sample->n, shi->vno);
1650                                         VECCOPY(sample->col, shi->ao);
1651                                         sample->intensity= MAX3(sample->col[0], sample->col[1], sample->col[2]);
1652                                         sample->dist2= INPR(shi->dxco, shi->dxco) + INPR(shi->dyco, shi->dyco);
1653                                         sample->filled= 1;
1654                                 }
1655                         }
1656                 }
1657         }
1658         else {
1659                 shi->ao[0]= 1.0f;
1660                 shi->ao[1]= 1.0f;
1661                 shi->ao[2]= 1.0f;
1662         }
1663 }
1664
1665 void cache_occ_samples(Render *re, RenderPart *pa, ShadeSample *ssamp)
1666 {
1667         OcclusionTree *tree= re->occlusiontree;
1668         PixStr ps;
1669         OcclusionCache *cache;
1670         OcclusionCacheSample *sample;
1671         OccFace exclude;
1672         ShadeInput *shi;
1673         intptr_t *rd=NULL;
1674         int *ro=NULL, *rp=NULL, *rz=NULL, onlyshadow;
1675         int x, y, step = CACHE_STEP;
1676
1677         if(!tree->cache)
1678                 return;
1679
1680         cache= &tree->cache[pa->thread];
1681         cache->w= pa->rectx;
1682         cache->h= pa->recty;
1683         cache->x= pa->disprect.xmin;
1684         cache->y= pa->disprect.ymin;
1685         cache->step= step;
1686         cache->sample= MEM_callocN(sizeof(OcclusionCacheSample)*cache->w*cache->h, "OcclusionCacheSample");
1687         sample= cache->sample;
1688
1689         if(re->osa) {
1690                 rd= pa->rectdaps;
1691         }
1692         else {
1693                 /* fake pixel struct for non-osa */
1694                 ps.next= NULL;
1695                 ps.mask= 0xFFFF;
1696
1697                 ro= pa->recto;
1698                 rp= pa->rectp;
1699                 rz= pa->rectz;
1700         }
1701
1702         /* compute a sample at every step pixels */
1703         for(y=pa->disprect.ymin; y<pa->disprect.ymax; y++) {
1704                 for(x=pa->disprect.xmin; x<pa->disprect.xmax; x++, sample++, rd++, ro++, rp++, rz++) {
1705                         if(!(((x - pa->disprect.xmin + step) % step) == 0 || x == pa->disprect.xmax-1))
1706                                 continue;
1707                         if(!(((y - pa->disprect.ymin + step) % step) == 0 || y == pa->disprect.ymax-1))
1708                                 continue;
1709
1710                         if(re->osa) {
1711                                 if(!*rd) continue;
1712
1713                                 shade_samples_fill_with_ps(ssamp, (PixStr *)(*rd), x, y);
1714                         }
1715                         else {
1716                                 if(!*rp) continue;
1717
1718                                 ps.obi= *ro;
1719                                 ps.facenr= *rp;
1720                                 ps.z= *rz;
1721                                 shade_samples_fill_with_ps(ssamp, &ps, x, y);
1722                         }
1723
1724                         shi= ssamp->shi;
1725                         if(shi->vlr) {
1726                                 onlyshadow= (shi->mat->mode & MA_ONLYSHADOW);
1727                                 exclude.obi= shi->obi - re->objectinstance;
1728                                 exclude.facenr= shi->vlr->index;
1729                                 sample_occ_tree(re, tree, &exclude, shi->co, shi->vno, shi->thread, onlyshadow, shi->ao);
1730
1731                                 VECCOPY(sample->co, shi->co);
1732                                 VECCOPY(sample->n, shi->vno);
1733                                 VECCOPY(sample->col, shi->ao);
1734                                 sample->intensity= MAX3(sample->col[0], sample->col[1], sample->col[2]);
1735                                 sample->dist2= INPR(shi->dxco, shi->dxco) + INPR(shi->dyco, shi->dyco);
1736                                 sample->x= shi->xs;
1737                                 sample->y= shi->ys;
1738                                 sample->filled= 1;
1739                         }
1740
1741                         if(re->test_break())
1742                                 break;
1743                 }
1744         }
1745 }
1746
1747 void free_occ_samples(Render *re, RenderPart *pa)
1748 {
1749         OcclusionTree *tree= re->occlusiontree;
1750         OcclusionCache *cache;
1751
1752         if(tree->cache) {
1753                 cache= &tree->cache[pa->thread];
1754
1755                 if(cache->sample)
1756                         MEM_freeN(cache->sample);
1757
1758                 cache->w= 0;
1759                 cache->h= 0;
1760                 cache->step= 0;
1761         }
1762 }
1763