* First commit merging 2.4-based sim_physics in to volume25 branch.
[blender-staging.git] / source / blender / render / intern / source / occlusion.c
1 /* 
2  * $Id$
3  *
4  * ***** BEGIN GPL LICENSE BLOCK *****
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software Foundation,
18  * Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
19  *
20  * The Original Code is Copyright (C) 2008 Blender Foundation.
21  * All rights reserved.
22  *
23  * The Original Code is: all of this file.
24  *
25  * Contributor(s): Brecht Van Lommel.
26  *
27  * ***** END GPL LICENSE BLOCK *****
28  */
29
30 #include <math.h>
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34
35 #include "MEM_guardedalloc.h"
36
37 #include "DNA_material_types.h"
38
39 #include "BLI_arithb.h"
40 #include "BLI_blenlib.h"
41 #include "BLI_memarena.h"
42 #include "BLI_threads.h"
43
44 #include "BKE_global.h"
45 #include "BKE_scene.h"
46 #include "BKE_utildefines.h"
47
48 #include "RE_shader_ext.h"
49
50 /* local includes */
51 #include "occlusion.h"
52 #include "render_types.h"
53 #include "rendercore.h"
54 #include "renderdatabase.h"
55 #include "pixelshading.h"
56 #include "shading.h"
57 #include "zbuf.h"
58
59 /* ------------------------- Declarations --------------------------- */
60
61 #define INVALID_INDEX ((int)(~0))
62 #define INVPI 0.31830988618379069f
63 #define TOTCHILD 8
64 #define CACHE_STEP 3
65
66 typedef struct OcclusionCacheSample {
67         float co[3], n[3], col[3], intensity, dist2;
68         int x, y, filled;
69 } OcclusionCacheSample;
70
71 typedef struct OcclusionCache {
72         OcclusionCacheSample *sample;
73         int x, y, w, h, step;
74 } OcclusionCache;
75
76 typedef struct OccFace {
77         int obi;
78         int facenr;
79 } OccFace;
80
81 typedef struct OccNode {
82         float co[3], area;
83         float sh[9], dco;
84         float occlusion;
85         int childflag;
86         union {
87                 //OccFace face;
88                 int face;
89                 struct OccNode *node;
90         } child[TOTCHILD];
91 } OccNode;
92
93 typedef struct OcclusionTree {
94         MemArena *arena;
95
96         float (*co)[3];         /* temporary during build */
97
98         OccFace *face;          /* instance and face indices */
99         float *occlusion;       /* occlusion for faces */
100         
101         OccNode *root;
102
103         OccNode **stack[BLENDER_MAX_THREADS];
104         int maxdepth;
105
106         int totface;
107
108         float error;
109         float distfac;
110
111         int dothreadedbuild;
112         int totbuildthread;
113
114         OcclusionCache *cache;
115 } OcclusionTree;
116
117 typedef struct OcclusionThread {
118         Render *re;
119         StrandSurface *mesh;
120         float (*facecol)[3];
121         int begin, end;
122         int thread;
123 } OcclusionThread;
124
125 typedef struct OcclusionBuildThread {
126         OcclusionTree *tree;
127         int begin, end, depth;
128         OccNode *node;
129 } OcclusionBuildThread;
130
131 /* ------------------------- Shading --------------------------- */
132
133 extern Render R; // meh
134
135 #if 0
136 static void occ_shade(ShadeSample *ssamp, ObjectInstanceRen *obi, VlakRen *vlr, float *rad)
137 {
138         ShadeInput *shi= ssamp->shi;
139         ShadeResult *shr= ssamp->shr;
140         float l, u, v, *v1, *v2, *v3;
141         
142         /* init */
143         if(vlr->v4) {
144                 shi->u= u= 0.5f;
145                 shi->v= v= 0.5f;
146         }
147         else {
148                 shi->u= u= 1.0f/3.0f;
149                 shi->v= v= 1.0f/3.0f;
150         }
151
152         /* setup render coordinates */
153         v1= vlr->v1->co;
154         v2= vlr->v2->co;
155         v3= vlr->v3->co;
156         
157         /* renderco */
158         l= 1.0f-u-v;
159         
160         shi->co[0]= l*v3[0]+u*v1[0]+v*v2[0];
161         shi->co[1]= l*v3[1]+u*v1[1]+v*v2[1];
162         shi->co[2]= l*v3[2]+u*v1[2]+v*v2[2];
163         
164         shade_input_set_triangle_i(shi, obi, vlr, 0, 1, 2);
165
166         /* set up view vector */
167         VECCOPY(shi->view, shi->co);
168         Normalize(shi->view);
169         
170         /* cache for shadow */
171         shi->samplenr++;
172         
173         shi->xs= 0; // TODO
174         shi->ys= 0;
175         
176         shade_input_set_normals(shi);
177
178         /* no normal flip */
179         if(shi->flippednor)
180                 shade_input_flip_normals(shi);
181
182         /* not a pretty solution, but fixes common cases */
183         if(shi->obr->ob && shi->obr->ob->transflag & OB_NEG_SCALE) {
184                 VecNegf(shi->vn);
185                 VecNegf(shi->vno);
186         }
187
188         /* init material vars */
189         // note, keep this synced with render_types.h
190         memcpy(&shi->r, &shi->mat->r, 23*sizeof(float));
191         shi->har= shi->mat->har;
192         
193         /* render */
194         shade_input_set_shade_texco(shi);
195         shade_material_loop(shi, shr); /* todo: nodes */
196         
197         VECCOPY(rad, shr->combined);
198 }
199
200 static void occ_build_shade(Render *re, OcclusionTree *tree)
201 {
202         ShadeSample ssamp;
203         ObjectInstanceRen *obi;
204         VlakRen *vlr;
205         int a;
206
207         R= *re;
208
209         /* setup shade sample with correct passes */
210         memset(&ssamp, 0, sizeof(ShadeSample));
211         ssamp.shi[0].lay= re->scene->lay;
212         ssamp.shi[0].passflag= SCE_PASS_DIFFUSE|SCE_PASS_RGBA;
213         ssamp.shi[0].combinedflag= ~(SCE_PASS_SPEC);
214         ssamp.tot= 1;
215
216         for(a=0; a<tree->totface; a++) {
217                 obi= &R.objectinstance[tree->face[a].obi];
218                 vlr= RE_findOrAddVlak(obi->obr, tree->face[a].vlr);
219
220                 occ_shade(&ssamp, obi, vlr, tree->rad[a]);
221         }
222 }
223 #endif
224
225 /* ------------------------- Spherical Harmonics --------------------------- */
226
227 /* Use 2nd order SH => 9 coefficients, stored in this order:
228    0 = (0,0),
229    1 = (1,-1), 2 = (1,0), 3 = (1,1),
230    4 = (2,-2), 5 = (2,-1), 6 = (2,0), 7 = (2,1), 8 = (2,2) */
231
232 static void sh_copy(float *shresult, float *sh)
233 {
234         memcpy(shresult, sh, sizeof(float)*9);
235 }
236
237 static void sh_mul(float *sh, float f)
238 {
239         int i;
240
241         for(i=0; i<9; i++)
242                 sh[i] *= f;
243 }
244
245 static void sh_add(float *shresult, float *sh1, float *sh2)
246 {
247         int i;
248
249         for(i=0; i<9; i++)
250                 shresult[i]= sh1[i] + sh2[i];
251 }
252
253 static void sh_from_disc(float *n, float area, float *shresult)
254 {
255         /* See formula (3) in:
256            "An Efficient Representation for Irradiance Environment Maps" */
257         float sh[9], x, y, z;
258
259         x= n[0];
260         y= n[1];
261         z= n[2];
262
263         sh[0]= 0.282095f;
264
265         sh[1]= 0.488603f*y;
266         sh[2]= 0.488603f*z;
267         sh[3]= 0.488603f*x;
268         
269         sh[4]= 1.092548f*x*y;
270         sh[5]= 1.092548f*y*z;
271         sh[6]= 0.315392f*(3.0f*z*z - 1.0f);
272         sh[7]= 1.092548f*x*z;
273         sh[8]= 0.546274f*(x*x - y*y);
274
275         sh_mul(sh, area);
276         sh_copy(shresult, sh);
277 }
278
279 static float sh_eval(float *sh, float *v)
280 {
281         /* See formula (13) in:
282            "An Efficient Representation for Irradiance Environment Maps" */
283         static const float c1 = 0.429043f, c2 = 0.511664f, c3 = 0.743125f;
284         static const float c4 = 0.886227f, c5 = 0.247708f;
285         float x, y, z, sum;
286
287         x= v[0];
288         y= v[1];
289         z= v[2];
290
291         sum= c1*sh[8]*(x*x - y*y);
292         sum += c3*sh[6]*z*z;
293         sum += c4*sh[0];
294         sum += -c5*sh[6];
295         sum += 2.0f*c1*(sh[4]*x*y + sh[7]*x*z + sh[5]*y*z);
296         sum += 2.0f*c2*(sh[3]*x + sh[1]*y + sh[2]*z);
297
298         return sum;
299 }
300
301 /* ------------------------------ Building --------------------------------- */
302
303 static void occ_face(const OccFace *face, float *co, float *normal, float *area)
304 {
305         ObjectInstanceRen *obi;
306         VlakRen *vlr;
307         float v1[3], v2[3], v3[3], v4[3];
308
309         obi= &R.objectinstance[face->obi];
310         vlr= RE_findOrAddVlak(obi->obr, face->facenr);
311         
312         if(co) {
313                 if(vlr->v4)
314                         VecLerpf(co, vlr->v1->co, vlr->v3->co, 0.5f);
315                 else
316                         CalcCent3f(co, vlr->v1->co, vlr->v2->co, vlr->v3->co);
317
318                 if(obi->flag & R_TRANSFORMED)
319                         Mat4MulVecfl(obi->mat, co);
320         }
321         
322         if(normal) {
323                 normal[0]= -vlr->n[0];
324                 normal[1]= -vlr->n[1];
325                 normal[2]= -vlr->n[2];
326
327                 if(obi->flag & R_TRANSFORMED)
328                         Mat3MulVecfl(obi->nmat, normal);
329         }
330
331         if(area) {
332                 VECCOPY(v1, vlr->v1->co);
333                 VECCOPY(v2, vlr->v2->co);
334                 VECCOPY(v3, vlr->v3->co);
335                 if(vlr->v4) VECCOPY(v4, vlr->v4->co);
336
337                 if(obi->flag & R_TRANSFORMED) {
338                         Mat4MulVecfl(obi->mat, v1);
339                         Mat4MulVecfl(obi->mat, v2);
340                         Mat4MulVecfl(obi->mat, v3);
341                         if(vlr->v4) Mat4MulVecfl(obi->mat, v4);
342                 }
343
344                 /* todo: correct area for instances */
345                 if(vlr->v4)
346                         *area= AreaQ3Dfl(v1, v2, v3, v4);
347                 else
348                         *area= AreaT3Dfl(v1, v2, v3);
349         }
350 }
351
352 static void occ_sum_occlusion(OcclusionTree *tree, OccNode *node)
353 {
354         OccNode *child;
355         float occ, area, totarea;
356         int a, b;
357
358         occ= 0.0f;
359         totarea= 0.0f;
360
361         for(b=0; b<TOTCHILD; b++) {
362                 if(node->childflag & (1<<b)) {
363                         a= node->child[b].face;
364                         occ_face(&tree->face[a], 0, 0, &area);
365                         occ += area*tree->occlusion[a];
366                         totarea += area;
367                 }
368                 else if(node->child[b].node) {
369                         child= node->child[b].node;
370                         occ_sum_occlusion(tree, child);
371
372                         occ += child->area*child->occlusion;
373                         totarea += child->area;
374                 }
375         }
376
377         if(totarea != 0.0f)
378                 occ /= totarea;
379         
380         node->occlusion= occ;
381 }
382
383 static int occ_find_bbox_axis(OcclusionTree *tree, int begin, int end, float *min, float *max)
384 {
385         float len, maxlen= -1.0f;
386         int a, axis = 0;
387
388         INIT_MINMAX(min, max);
389
390         for(a=begin; a<end; a++)
391                 DO_MINMAX(tree->co[a], min, max)
392
393         for(a=0; a<3; a++) {
394                 len= max[a] - min[a];
395
396                 if(len > maxlen) {
397                         maxlen= len;
398                         axis= a;
399                 }
400         }
401
402         return axis;
403 }
404
405 static void occ_node_from_face(OccFace *face, OccNode *node)
406 {
407         float n[3];
408
409         occ_face(face, node->co, n, &node->area);
410         node->dco= 0.0f;
411         sh_from_disc(n, node->area, node->sh);
412 }
413
414 static void occ_build_dco(OcclusionTree *tree, OccNode *node, float *co, float *dco)
415 {
416         OccNode *child;
417         float dist, d[3], nco[3];
418         int b;
419
420         for(b=0; b<TOTCHILD; b++) {
421                 if(node->childflag & (1<<b)) {
422                         occ_face(tree->face+node->child[b].face, nco, 0, 0);
423                 }
424                 else if(node->child[b].node) {
425                         child= node->child[b].node;
426                         occ_build_dco(tree, child, co, dco);
427                         VECCOPY(nco, child->co);
428                 }
429
430                 VECSUB(d, nco, co);
431                 dist= INPR(d, d);
432                 if(dist > *dco)
433                         *dco= dist;
434         }
435 }
436
437 static void occ_build_split(OcclusionTree *tree, int begin, int end, int *split)
438 {
439         float min[3], max[3], mid;
440         int axis, a, enda;
441
442         /* split in middle of boundbox. this seems faster than median split
443          * on complex scenes, possibly since it avoids two distant faces to
444          * be in the same node better? */
445         axis= occ_find_bbox_axis(tree, begin, end, min, max);
446         mid= 0.5f*(min[axis]+max[axis]);
447
448         a= begin;
449         enda= end;
450         while(a<enda) {
451                 if(tree->co[a][axis] > mid) {
452                         enda--;
453                         SWAP(OccFace, tree->face[a], tree->face[enda]);
454                         SWAP(float, tree->co[a][0], tree->co[enda][0]);
455                         SWAP(float, tree->co[a][1], tree->co[enda][1]);
456                         SWAP(float, tree->co[a][2], tree->co[enda][2]);
457                 }
458                 else
459                         a++;
460         }
461
462         *split= enda;
463 }
464
465 static void occ_build_8_split(OcclusionTree *tree, int begin, int end, int *offset, int *count)
466 {
467         /* split faces into eight groups */
468         int b, splitx, splity[2], splitz[4];
469
470         occ_build_split(tree, begin, end, &splitx);
471
472         /* force split if none found, to deal with degenerate geometry */
473         if(splitx == begin || splitx == end)
474                 splitx= (begin+end)/2;
475
476         occ_build_split(tree, begin, splitx, &splity[0]);
477         occ_build_split(tree, splitx, end, &splity[1]);
478
479         occ_build_split(tree, begin, splity[0], &splitz[0]);
480         occ_build_split(tree, splity[0], splitx, &splitz[1]);
481         occ_build_split(tree, splitx, splity[1], &splitz[2]);
482         occ_build_split(tree, splity[1], end, &splitz[3]);
483
484         offset[0]= begin;
485         offset[1]= splitz[0];
486         offset[2]= splity[0];
487         offset[3]= splitz[1];
488         offset[4]= splitx;
489         offset[5]= splitz[2];
490         offset[6]= splity[1];
491         offset[7]= splitz[3];
492
493         for(b=0; b<7; b++)
494                 count[b]= offset[b+1] - offset[b];
495         count[7]= end - offset[7];
496 }
497
498 static void occ_build_recursive(OcclusionTree *tree, OccNode *node, int begin, int end, int depth);
499
500 static void *exec_occ_build(void *data)
501 {
502         OcclusionBuildThread *othread= (OcclusionBuildThread*)data;
503
504         occ_build_recursive(othread->tree, othread->node, othread->begin, othread->end, othread->depth);
505
506         return 0;
507 }
508
509 static void occ_build_recursive(OcclusionTree *tree, OccNode *node, int begin, int end, int depth)
510 {
511         ListBase threads;
512         OcclusionBuildThread othreads[BLENDER_MAX_THREADS];
513         OccNode *child, tmpnode;
514         OccFace *face;
515         int a, b, totthread=0, offset[TOTCHILD], count[TOTCHILD];
516
517         /* add a new node */
518         node->occlusion= 1.0f;
519
520         /* leaf node with only children */
521         if(end - begin <= TOTCHILD) {
522                 for(a=begin, b=0; a<end; a++, b++) {
523                         face= &tree->face[a];
524                         node->child[b].face= a;
525                         node->childflag |= (1<<b);
526                 }
527         }
528         else {
529                 /* order faces */
530                 occ_build_8_split(tree, begin, end, offset, count);
531
532                 if(depth == 1 && tree->dothreadedbuild)
533                         BLI_init_threads(&threads, exec_occ_build, tree->totbuildthread);
534
535                 for(b=0; b<TOTCHILD; b++) {
536                         if(count[b] == 0) {
537                                 node->child[b].node= NULL;
538                         }
539                         else if(count[b] == 1) {
540                                 face= &tree->face[offset[b]];
541                                 node->child[b].face= offset[b];
542                                 node->childflag |= (1<<b);
543                         }
544                         else {
545                                 if(tree->dothreadedbuild)
546                                         BLI_lock_thread(LOCK_CUSTOM1);
547
548                                 child= BLI_memarena_alloc(tree->arena, sizeof(OccNode));
549                                 node->child[b].node= child;
550
551                                 /* keep track of maximum depth for stack */
552                                 if(depth+1 > tree->maxdepth)
553                                         tree->maxdepth= depth+1;
554
555                                 if(tree->dothreadedbuild)
556                                         BLI_unlock_thread(LOCK_CUSTOM1);
557
558                                 if(depth == 1 && tree->dothreadedbuild) {
559                                         othreads[totthread].tree= tree;
560                                         othreads[totthread].node= child;
561                                         othreads[totthread].begin= offset[b];
562                                         othreads[totthread].end= offset[b]+count[b];
563                                         othreads[totthread].depth= depth+1;
564                                         BLI_insert_thread(&threads, &othreads[totthread]);
565                                         totthread++;
566                                 }
567                                 else
568                                         occ_build_recursive(tree, child, offset[b], offset[b]+count[b], depth+1);
569                         }
570                 }
571
572                 if(depth == 1 && tree->dothreadedbuild)
573                         BLI_end_threads(&threads);
574         }
575
576         /* combine area, position and sh */
577         for(b=0; b<TOTCHILD; b++) {
578                 if(node->childflag & (1<<b)) {
579                         child= &tmpnode;
580                         occ_node_from_face(tree->face+node->child[b].face, &tmpnode);
581                 }
582                 else {
583                         child= node->child[b].node;
584                 }
585
586                 if(child) {
587                         node->area += child->area;
588                         sh_add(node->sh, node->sh, child->sh);
589                         VECADDFAC(node->co, node->co, child->co, child->area);
590                 }
591         }
592
593         if(node->area != 0.0f)
594                 VecMulf(node->co, 1.0f/node->area);
595
596         /* compute maximum distance from center */
597         node->dco= 0.0f;
598         occ_build_dco(tree, node, node->co, &node->dco);
599 }
600
601 static void occ_build_sh_normalize(OccNode *node)
602 {
603         /* normalize spherical harmonics to not include area, so
604          * we can clamp the dot product and then mutliply by area */
605         int b;
606
607         if(node->area != 0.0f)
608                 sh_mul(node->sh, 1.0f/node->area);
609
610         for(b=0; b<TOTCHILD; b++) {
611                 if(node->childflag & (1<<b));
612                 else if(node->child[b].node)
613                         occ_build_sh_normalize(node->child[b].node);
614         }
615 }
616
617 static OcclusionTree *occ_tree_build(Render *re)
618 {
619         OcclusionTree *tree;
620         ObjectInstanceRen *obi;
621         ObjectRen *obr;
622         VlakRen *vlr= NULL;
623         int a, b, c, totface;
624
625         /* count */
626         totface= 0;
627         for(obi=re->instancetable.first; obi; obi=obi->next) {
628                 obr= obi->obr;
629                 for(a=0; a<obr->totvlak; a++) {
630                         if((a & 255)==0) vlr= obr->vlaknodes[a>>8].vlak;
631                         else vlr++;
632
633                         if((vlr->mat->mode & MA_TRACEBLE) && (vlr->mat->material_type == MA_TYPE_SURFACE))
634                                 totface++;
635                 }
636         }
637
638         if(totface == 0)
639                 return NULL;
640         
641         tree= MEM_callocN(sizeof(OcclusionTree), "OcclusionTree");
642         tree->totface= totface;
643
644         /* parameters */
645         tree->error= get_render_aosss_error(&re->r, re->wrld.ao_approx_error);
646         tree->distfac= (re->wrld.aomode & WO_AODIST)? re->wrld.aodistfac: 0.0f;
647
648         /* allocation */
649         tree->arena= BLI_memarena_new(0x8000 * sizeof(OccNode));
650         BLI_memarena_use_calloc(tree->arena);
651
652         if(re->wrld.aomode & WO_AOCACHE)
653                 tree->cache= MEM_callocN(sizeof(OcclusionCache)*BLENDER_MAX_THREADS, "OcclusionCache");
654
655         tree->face= MEM_callocN(sizeof(OccFace)*totface, "OcclusionFace");
656         tree->co= MEM_callocN(sizeof(float)*3*totface, "OcclusionCo");
657         tree->occlusion= MEM_callocN(sizeof(float)*totface, "OcclusionOcclusion");
658
659         /* make array of face pointers */
660         for(b=0, c=0, obi=re->instancetable.first; obi; obi=obi->next, c++) {
661                 obr= obi->obr;
662                 for(a=0; a<obr->totvlak; a++) {
663                         if((a & 255)==0) vlr= obr->vlaknodes[a>>8].vlak;
664                         else vlr++;
665
666                         if(vlr->mat->mode & MA_TRACEBLE) {
667                                 tree->face[b].obi= c;
668                                 tree->face[b].facenr= a;
669                                 tree->occlusion[b]= 1.0f;
670                                 occ_face(&tree->face[b], tree->co[b], NULL, NULL); 
671                                 b++;
672                         }
673                 }
674         }
675
676         /* threads */
677         tree->totbuildthread= (re->r.threads > 1 && totface > 10000)? 8: 1;
678         tree->dothreadedbuild= (tree->totbuildthread > 1);
679
680         /* recurse */
681         tree->root= BLI_memarena_alloc(tree->arena, sizeof(OccNode));
682         tree->maxdepth= 1;
683         occ_build_recursive(tree, tree->root, 0, totface, 1);
684
685 #if 0
686         if(tree->doindirect) {
687                 occ_build_shade(re, tree);
688                 occ_sum_occlusion(tree, tree->root);
689         }
690 #endif
691         
692         MEM_freeN(tree->co);
693         tree->co= NULL;
694
695         occ_build_sh_normalize(tree->root);
696
697         for(a=0; a<BLENDER_MAX_THREADS; a++)
698                 tree->stack[a]= MEM_callocN(sizeof(OccNode)*TOTCHILD*(tree->maxdepth+1), "OccStack");
699
700         return tree;
701 }
702
703 static void occ_free_tree(OcclusionTree *tree)
704 {
705         int a;
706
707         if(tree) {
708                 if(tree->arena) BLI_memarena_free(tree->arena);
709                 for(a=0; a<BLENDER_MAX_THREADS; a++)
710                         if(tree->stack[a])
711                                 MEM_freeN(tree->stack[a]);
712                 if(tree->occlusion) MEM_freeN(tree->occlusion);
713                 if(tree->face) MEM_freeN(tree->face);
714                 if(tree->cache) MEM_freeN(tree->cache);
715                 MEM_freeN(tree);
716         }
717 }
718
719 /* ------------------------- Traversal --------------------------- */
720
721 static float occ_solid_angle(OccNode *node, float *v, float d2, float invd2, float *receivenormal)
722 {
723         float dotreceive, dotemit;
724         float ev[3];
725
726         ev[0]= -v[0]*invd2;
727         ev[1]= -v[1]*invd2;
728         ev[2]= -v[2]*invd2;
729         dotemit= sh_eval(node->sh, ev);
730         dotreceive= INPR(receivenormal, v)*invd2;
731
732         CLAMP(dotemit, 0.0f, 1.0f);
733         CLAMP(dotreceive, 0.0f, 1.0f);
734         
735         return ((node->area*dotemit*dotreceive)/(d2 + node->area*INVPI))*INVPI;
736 }
737
738 static void VecAddDir(float *result, float *v1, float *v2, float fac)
739 {
740         result[0]= v1[0] + fac*(v2[0] - v1[0]);
741         result[1]= v1[1] + fac*(v2[1] - v1[1]);
742         result[2]= v1[2] + fac*(v2[2] - v1[2]);
743 }
744
745 static int occ_visible_quad(float *p, float *n, float *v0, float *v1, float *v2, float *q0, float *q1, float *q2, float *q3)
746 {
747         static const float epsilon = 1e-6f;
748         float c, sd[3];
749         
750         c= INPR(n, p);
751
752         /* signed distances from the vertices to the plane. */
753         sd[0]= INPR(n, v0) - c;
754         sd[1]= INPR(n, v1) - c;
755         sd[2]= INPR(n, v2) - c;
756
757         if(fabsf(sd[0]) < epsilon) sd[0] = 0.0f;
758         if(fabsf(sd[1]) < epsilon) sd[1] = 0.0f;
759         if(fabsf(sd[2]) < epsilon) sd[2] = 0.0f;
760
761         if(sd[0] > 0) {
762                 if(sd[1] > 0) {
763                         if(sd[2] > 0) {
764                                 // +++
765                                 VECCOPY(q0, v0);
766                                 VECCOPY(q1, v1);
767                                 VECCOPY(q2, v2);
768                                 VECCOPY(q3, q2);
769                         }
770                         else if(sd[2] < 0) {
771                                 // ++-
772                                 VECCOPY(q0, v0);
773                                 VECCOPY(q1, v1);
774                                 VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
775                                 VecAddDir(q3, v0, v2, (sd[0]/(sd[0]-sd[2])));
776                         }
777                         else {
778                                 // ++0
779                                 VECCOPY(q0, v0);
780                                 VECCOPY(q1, v1);
781                                 VECCOPY(q2, v2);
782                                 VECCOPY(q3, q2);
783                         }
784                 }
785                 else if(sd[1] < 0) {
786                         if(sd[2] > 0) {
787                                 // +-+
788                                 VECCOPY(q0, v0);
789                                 VecAddDir(q1, v0, v1, (sd[0]/(sd[0]-sd[1])));
790                                 VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
791                                 VECCOPY(q3, v2);
792                         }
793                         else if(sd[2] < 0) {
794                                 // +--
795                                 VECCOPY(q0, v0);
796                                 VecAddDir(q1, v0, v1, (sd[0]/(sd[0]-sd[1])));
797                                 VecAddDir(q2, v0, v2, (sd[0]/(sd[0]-sd[2])));
798                                 VECCOPY(q3, q2);
799                         }
800                         else {
801                                 // +-0
802                                 VECCOPY(q0, v0);
803                                 VecAddDir(q1, v0, v1, (sd[0]/(sd[0]-sd[1])));
804                                 VECCOPY(q2, v2);
805                                 VECCOPY(q3, q2);
806                         }
807                 }
808                 else {
809                         if(sd[2] > 0) {
810                                 // +0+
811                                 VECCOPY(q0, v0);
812                                 VECCOPY(q1, v1);
813                                 VECCOPY(q2, v2);
814                                 VECCOPY(q3, q2);
815                         }
816                         else if(sd[2] < 0) {
817                                 // +0-
818                                 VECCOPY(q0, v0);
819                                 VECCOPY(q1, v1);
820                                 VecAddDir(q2, v0, v2, (sd[0]/(sd[0]-sd[2])));
821                                 VECCOPY(q3, q2);
822                         }
823                         else {
824                                 // +00
825                                 VECCOPY(q0, v0);
826                                 VECCOPY(q1, v1);
827                                 VECCOPY(q2, v2);
828                                 VECCOPY(q3, q2);
829                         }
830                 }
831         }
832         else if(sd[0] < 0) {
833                 if(sd[1] > 0) {
834                         if(sd[2] > 0) {
835                                 // -++
836                                 VecAddDir(q0, v0, v1, (sd[0]/(sd[0]-sd[1])));
837                                 VECCOPY(q1, v1);
838                                 VECCOPY(q2, v2);
839                                 VecAddDir(q3, v0, v2, (sd[0]/(sd[0]-sd[2])));
840                         }
841                         else if(sd[2] < 0) {
842                                 // -+-
843                                 VecAddDir(q0, v0, v1, (sd[0]/(sd[0]-sd[1])));
844                                 VECCOPY(q1, v1);
845                                 VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
846                                 VECCOPY(q3, q2);
847                         }
848                         else {
849                                 // -+0
850                                 VecAddDir(q0, v0, v1, (sd[0]/(sd[0]-sd[1])));
851                                 VECCOPY(q1, v1);
852                                 VECCOPY(q2, v2);
853                                 VECCOPY(q3, q2);
854                         }
855                 }
856                 else if(sd[1] < 0) {
857                         if(sd[2] > 0) {
858                                 // --+
859                                 VecAddDir(q0, v0, v2, (sd[0]/(sd[0]-sd[2])));
860                                 VecAddDir(q1, v1, v2, (sd[1]/(sd[1]-sd[2])));
861                                 VECCOPY(q2, v2);
862                                 VECCOPY(q3, q2);
863                         }
864                         else if(sd[2] < 0) {
865                                 // ---
866                                 return 0;
867                         }
868                         else {
869                                 // --0
870                                 return 0;
871                         }
872                 }
873                 else {
874                         if(sd[2] > 0) {
875                                 // -0+
876                                 VecAddDir(q0, v0, v2, (sd[0]/(sd[0]-sd[2])));
877                                 VECCOPY(q1, v1);
878                                 VECCOPY(q2, v2);
879                                 VECCOPY(q3, q2);
880                         }
881                         else if(sd[2] < 0) {
882                                 // -0-
883                                 return 0;
884                         }
885                         else {
886                                 // -00
887                                 return 0;
888                         }
889                 }
890         }
891         else {
892                 if(sd[1] > 0) {
893                         if(sd[2] > 0) {
894                                 // 0++
895                                 VECCOPY(q0, v0);
896                                 VECCOPY(q1, v1);
897                                 VECCOPY(q2, v2);
898                                 VECCOPY(q3, q2);
899                         }
900                         else if(sd[2] < 0) {
901                                 // 0+-
902                                 VECCOPY(q0, v0);
903                                 VECCOPY(q1, v1);
904                                 VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
905                                 VECCOPY(q3, q2);
906                         }
907                         else {
908                                 // 0+0
909                                 VECCOPY(q0, v0);
910                                 VECCOPY(q1, v1);
911                                 VECCOPY(q2, v2);
912                                 VECCOPY(q3, q2);
913                         }
914                 }
915                 else if(sd[1] < 0) {
916                         if(sd[2] > 0) {
917                                 // 0-+
918                                 VECCOPY(q0, v0);
919                                 VecAddDir(q1, v1, v2, (sd[1]/(sd[1]-sd[2])));
920                                 VECCOPY(q2, v2);
921                                 VECCOPY(q3, q2);
922                         }
923                         else if(sd[2] < 0) {
924                                 // 0--
925                                 return 0;
926                         }
927                         else {
928                                 // 0-0
929                                 return 0;
930                         }
931                 }
932                 else {
933                         if(sd[2] > 0) {
934                                 // 00+
935                                 VECCOPY(q0, v0);
936                                 VECCOPY(q1, v1);
937                                 VECCOPY(q2, v2);
938                                 VECCOPY(q3, q2);
939                         }
940                         else if(sd[2] < 0) {
941                                 // 00-
942                                 return 0;
943                         }
944                         else {
945                                 // 000
946                                 return 0;
947                         }
948                 }
949         }
950
951         return 1;
952 }
953
954 /* altivec optimization, this works, but is unused */
955
956 #if 0
957 #include <Accelerate/Accelerate.h>
958
959 typedef union {
960         vFloat v;
961         float f[4];
962 } vFloatResult;
963
964 static vFloat vec_splat_float(float val)
965 {
966         return (vFloat){val, val, val, val};
967 }
968
969 static float occ_quad_form_factor(float *p, float *n, float *q0, float *q1, float *q2, float *q3)
970 {
971         vFloat vcos, rlen, vrx, vry, vrz, vsrx, vsry, vsrz, gx, gy, gz, vangle;
972         vUInt8 rotate = (vUInt8){4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3};
973         vFloatResult vresult;
974         float result;
975
976         /* compute r* */
977         vrx = (vFloat){q0[0], q1[0], q2[0], q3[0]} - vec_splat_float(p[0]);
978         vry = (vFloat){q0[1], q1[1], q2[1], q3[1]} - vec_splat_float(p[1]);
979         vrz = (vFloat){q0[2], q1[2], q2[2], q3[2]} - vec_splat_float(p[2]);
980
981         /* normalize r* */
982         rlen = vec_rsqrte(vrx*vrx + vry*vry + vrz*vrz + vec_splat_float(1e-16f));
983         vrx = vrx*rlen;
984         vry = vry*rlen;
985         vrz = vrz*rlen;
986
987         /* rotate r* for cross and dot */
988         vsrx= vec_perm(vrx, vrx, rotate);
989         vsry= vec_perm(vry, vry, rotate);
990         vsrz= vec_perm(vrz, vrz, rotate);
991
992         /* cross product */
993         gx = vsry*vrz - vsrz*vry;
994         gy = vsrz*vrx - vsrx*vrz;
995         gz = vsrx*vry - vsry*vrx;
996
997         /* normalize */
998         rlen = vec_rsqrte(gx*gx + gy*gy + gz*gz + vec_splat_float(1e-16f));
999         gx = gx*rlen;
1000         gy = gy*rlen;
1001         gz = gz*rlen;
1002
1003         /* angle */
1004         vcos = vrx*vsrx + vry*vsry + vrz*vsrz;
1005         vcos= vec_max(vec_min(vcos, vec_splat_float(1.0f)), vec_splat_float(-1.0f));
1006         vangle= vacosf(vcos);
1007
1008         /* dot */
1009         vresult.v = (vec_splat_float(n[0])*gx +
1010                      vec_splat_float(n[1])*gy +
1011                      vec_splat_float(n[2])*gz)*vangle;
1012
1013         result= (vresult.f[0] + vresult.f[1] + vresult.f[2] + vresult.f[3])*(0.5f/(float)M_PI);
1014         result= MAX2(result, 0.0f);
1015
1016         return result;
1017 }
1018
1019 #endif
1020
1021 /* SSE optimization, acos code doesn't work */
1022
1023 #if 0
1024
1025 #include <xmmintrin.h>
1026
1027 static __m128 sse_approx_acos(__m128 x)
1028 {
1029         /* needs a better approximation than taylor expansion of acos, since that
1030          * gives big erros for near 1.0 values, sqrt(2*x)*acos(1-x) should work
1031          * better, see http://www.tom.womack.net/projects/sse-fast-arctrig.html */
1032
1033         return _mm_set_ps1(1.0f);
1034 }
1035
1036 static float occ_quad_form_factor(float *p, float *n, float *q0, float *q1, float *q2, float *q3)
1037 {
1038         float r0[3], r1[3], r2[3], r3[3], g0[3], g1[3], g2[3], g3[3];
1039         float a1, a2, a3, a4, dot1, dot2, dot3, dot4, result;
1040         float fresult[4] __attribute__((aligned(16)));
1041         __m128 qx, qy, qz, rx, ry, rz, rlen, srx, sry, srz, gx, gy, gz, glen, rcos, angle, aresult;
1042
1043         /* compute r */
1044         qx = _mm_set_ps(q3[0], q2[0], q1[0], q0[0]);
1045         qy = _mm_set_ps(q3[1], q2[1], q1[1], q0[1]);
1046         qz = _mm_set_ps(q3[2], q2[2], q1[2], q0[2]);
1047
1048         rx = qx - _mm_set_ps1(p[0]);
1049         ry = qy - _mm_set_ps1(p[1]);
1050         rz = qz - _mm_set_ps1(p[2]);
1051
1052         /* normalize r */
1053         rlen = _mm_rsqrt_ps(rx*rx + ry*ry + rz*rz + _mm_set_ps1(1e-16f));
1054         rx = rx*rlen;
1055         ry = ry*rlen;
1056         rz = rz*rlen;
1057
1058         /* cross product */
1059         srx = _mm_shuffle_ps(rx, rx, _MM_SHUFFLE(0,3,2,1));
1060         sry = _mm_shuffle_ps(ry, ry, _MM_SHUFFLE(0,3,2,1));
1061         srz = _mm_shuffle_ps(rz, rz, _MM_SHUFFLE(0,3,2,1));
1062
1063         gx = sry*rz - srz*ry;
1064         gy = srz*rx - srx*rz;
1065         gz = srx*ry - sry*rx;
1066
1067         /* normalize g */
1068         glen = _mm_rsqrt_ps(gx*gx + gy*gy + gz*gz + _mm_set_ps1(1e-16f));
1069         gx = gx*glen;
1070         gy = gy*glen;
1071         gz = gz*glen;
1072
1073         /* compute angle */
1074         rcos = rx*srx + ry*sry + rz*srz;
1075         rcos= _mm_max_ps(_mm_min_ps(rcos, _mm_set_ps1(1.0f)), _mm_set_ps1(-1.0f));
1076
1077         angle = sse_approx_cos(rcos);
1078         aresult = (_mm_set_ps1(n[0])*gx + _mm_set_ps1(n[1])*gy + _mm_set_ps1(n[2])*gz)*angle;
1079
1080         /* sum together */
1081     result= (fresult[0] + fresult[1] + fresult[2] + fresult[3])*(0.5f/(float)M_PI);
1082         result= MAX2(result, 0.0f);
1083
1084         return result;
1085 }
1086
1087 #endif
1088
1089 static void normalizef(float *n)
1090 {
1091         float d;
1092         
1093         d= INPR(n, n);
1094
1095         if(d > 1.0e-35F) {
1096                 d= 1.0f/sqrtf(d);
1097
1098                 n[0] *= d; 
1099                 n[1] *= d; 
1100                 n[2] *= d;
1101         } 
1102 }
1103
1104 static float occ_quad_form_factor(float *p, float *n, float *q0, float *q1, float *q2, float *q3)
1105 {
1106         float r0[3], r1[3], r2[3], r3[3], g0[3], g1[3], g2[3], g3[3];
1107         float a1, a2, a3, a4, dot1, dot2, dot3, dot4, result;
1108
1109         VECSUB(r0, q0, p);
1110         VECSUB(r1, q1, p);
1111         VECSUB(r2, q2, p);
1112         VECSUB(r3, q3, p);
1113
1114         normalizef(r0);
1115         normalizef(r1);
1116         normalizef(r2);
1117         normalizef(r3);
1118
1119         Crossf(g0, r1, r0); normalizef(g0);
1120         Crossf(g1, r2, r1); normalizef(g1);
1121         Crossf(g2, r3, r2); normalizef(g2);
1122         Crossf(g3, r0, r3); normalizef(g3);
1123
1124         a1= saacosf(INPR(r0, r1));
1125         a2= saacosf(INPR(r1, r2));
1126         a3= saacosf(INPR(r2, r3));
1127         a4= saacosf(INPR(r3, r0));
1128
1129         dot1= INPR(n, g0);
1130         dot2= INPR(n, g1);
1131         dot3= INPR(n, g2);
1132         dot4= INPR(n, g3);
1133
1134         result= (a1*dot1 + a2*dot2 + a3*dot3 + a4*dot4)*0.5f/(float)M_PI;
1135         result= MAX2(result, 0.0f);
1136
1137         return result;
1138 }
1139
1140 static float occ_form_factor(OccFace *face, float *p, float *n)
1141 {
1142         ObjectInstanceRen *obi;
1143         VlakRen *vlr;
1144         float v1[3], v2[3], v3[3], v4[3], q0[3], q1[3], q2[3], q3[3], contrib= 0.0f;
1145
1146         obi= &R.objectinstance[face->obi];
1147         vlr= RE_findOrAddVlak(obi->obr, face->facenr);
1148
1149         VECCOPY(v1, vlr->v1->co);
1150         VECCOPY(v2, vlr->v2->co);
1151         VECCOPY(v3, vlr->v3->co);
1152
1153         if(obi->flag & R_TRANSFORMED) {
1154                 Mat4MulVecfl(obi->mat, v1);
1155                 Mat4MulVecfl(obi->mat, v2);
1156                 Mat4MulVecfl(obi->mat, v3);
1157         }
1158
1159         if(occ_visible_quad(p, n, v1, v2, v3, q0, q1, q2, q3))
1160                 contrib += occ_quad_form_factor(p, n, q0, q1, q2, q3);
1161
1162         if(vlr->v4) {
1163                 VECCOPY(v4, vlr->v4->co);
1164                 if(obi->flag & R_TRANSFORMED)
1165                         Mat4MulVecfl(obi->mat, v4);
1166
1167                 if(occ_visible_quad(p, n, v1, v3, v4, q0, q1, q2, q3))
1168                         contrib += occ_quad_form_factor(p, n, q0, q1, q2, q3);
1169         }
1170
1171         return contrib;
1172 }
1173
1174 static void occ_lookup(OcclusionTree *tree, int thread, OccFace *exclude, float *pp, float *pn, float *occ, float *bentn)
1175 {
1176         OccNode *node, **stack;
1177         OccFace *face;
1178         float resultocc, v[3], p[3], n[3], co[3], invd2;
1179         float distfac, fac, error, d2, weight, emitarea;
1180         int b, totstack;
1181
1182         /* init variables */
1183         VECCOPY(p, pp);
1184         VECCOPY(n, pn);
1185         VECADDFAC(p, p, n, 1e-4f);
1186
1187         if(bentn)
1188                 VECCOPY(bentn, n);
1189         
1190         error= tree->error;
1191         distfac= tree->distfac;
1192
1193         resultocc= 0.0f;
1194
1195         /* init stack */
1196         stack= tree->stack[thread];
1197         stack[0]= tree->root;
1198         totstack= 1;
1199
1200         while(totstack) {
1201                 /* pop point off the stack */
1202                 node= stack[--totstack];
1203
1204                 VECSUB(v, node->co, p);
1205                 d2= INPR(v, v) + 1e-16f;
1206                 emitarea= MAX2(node->area, node->dco);
1207
1208                 if(d2*error > emitarea) {
1209                         if(distfac != 0.0f) {
1210                                 fac= 1.0f/(1.0f + distfac*d2);
1211                                 if(fac < 0.01f)
1212                                         continue;
1213                         }
1214                         else
1215                                 fac= 1.0f;
1216
1217                         /* accumulate occlusion from spherical harmonics */
1218                         invd2 = 1.0f/sqrtf(d2);
1219                         weight= occ_solid_angle(node, v, d2, invd2, n);
1220                         weight *= node->occlusion;
1221
1222                         if(bentn) {
1223                                 bentn[0] -= weight*invd2*v[0];
1224                                 bentn[1] -= weight*invd2*v[1];
1225                                 bentn[2] -= weight*invd2*v[2];
1226                         }
1227
1228                         resultocc += weight*fac;
1229                 }
1230                 else {
1231                         /* traverse into children */
1232                         for(b=0; b<TOTCHILD; b++) {
1233                                 if(node->childflag & (1<<b)) {
1234                                         face= tree->face+node->child[b].face;
1235
1236                                         /* accumulate occlusion with face form factor */
1237                                         if(!exclude || !(face->obi == exclude->obi && face->facenr == exclude->facenr)) {
1238                                                 if(bentn || distfac != 0.0f) {
1239                                                         occ_face(face, co, NULL, NULL); 
1240                                                         VECSUB(v, co, p);
1241                                                         d2= INPR(v, v) + 1e-16f;
1242
1243                                                         fac= (distfac == 0.0f)? 1.0f: 1.0f/(1.0f + distfac*d2);
1244                                                         if(fac < 0.01f)
1245                                                                 continue;
1246                                                 }
1247                                                 else
1248                                                         fac= 1.0f;
1249
1250                                                 weight= occ_form_factor(face, p, n);
1251                                                 weight *= tree->occlusion[node->child[b].face];
1252
1253                                                 if(bentn) {
1254                                                         invd2= 1.0f/sqrtf(d2);
1255                                                         bentn[0] -= weight*invd2*v[0];
1256                                                         bentn[1] -= weight*invd2*v[1];
1257                                                         bentn[2] -= weight*invd2*v[2];
1258                                                 }
1259
1260                                                 resultocc += weight*fac;
1261                                         }
1262                                 }
1263                                 else if(node->child[b].node) {
1264                                         /* push child on the stack */
1265                                         stack[totstack++]= node->child[b].node;
1266                                 }
1267                         }
1268                 }
1269         }
1270
1271         if(occ) *occ= resultocc;
1272         if(bentn) Normalize(bentn);
1273 }
1274
1275 static void occ_compute_passes(Render *re, OcclusionTree *tree, int totpass)
1276 {
1277         float *occ, co[3], n[3];
1278         int pass, i;
1279         
1280         occ= MEM_callocN(sizeof(float)*tree->totface, "OcclusionPassOcc");
1281
1282         for(pass=0; pass<totpass; pass++) {
1283                 for(i=0; i<tree->totface; i++) {
1284                         occ_face(&tree->face[i], co, n, NULL);
1285                         VecNegf(n);
1286                         VECADDFAC(co, co, n, 1e-8f);
1287
1288                         occ_lookup(tree, 0, &tree->face[i], co, n, &occ[i], NULL);
1289                         if(re->test_break(re->tbh))
1290                                 break;
1291                 }
1292
1293                 if(re->test_break(re->tbh))
1294                         break;
1295
1296                 for(i=0; i<tree->totface; i++) {
1297                         tree->occlusion[i] -= occ[i]; //MAX2(1.0f-occ[i], 0.0f);
1298                         if(tree->occlusion[i] < 0.0f)
1299                                 tree->occlusion[i]= 0.0f;
1300                 }
1301
1302                 occ_sum_occlusion(tree, tree->root);
1303         }
1304
1305         MEM_freeN(occ);
1306 }
1307
1308 static void sample_occ_tree(Render *re, OcclusionTree *tree, OccFace *exclude, float *co, float *n, int thread, int onlyshadow, float *skycol)
1309 {
1310         float nn[3], bn[3], fac, occ, occlusion, correction;
1311         int aocolor;
1312
1313         aocolor= re->wrld.aocolor;
1314         if(onlyshadow)
1315                 aocolor= WO_AOPLAIN;
1316
1317         VECCOPY(nn, n);
1318         VecNegf(nn);
1319
1320         occ_lookup(tree, thread, exclude, co, nn, &occ, (aocolor)? bn: NULL);
1321
1322         correction= re->wrld.ao_approx_correction;
1323
1324         occlusion= (1.0f-correction)*(1.0f-occ);
1325         CLAMP(occlusion, 0.0f, 1.0f);
1326         if(correction != 0.0f)
1327                 occlusion += correction*exp(-occ);
1328
1329         if(aocolor) {
1330                 /* sky shading using bent normal */
1331                 if(ELEM(aocolor, WO_AOSKYCOL, WO_AOSKYTEX)) {
1332                         fac= 0.5*(1.0f+bn[0]*re->grvec[0]+ bn[1]*re->grvec[1]+ bn[2]*re->grvec[2]);
1333                         skycol[0]= (1.0f-fac)*re->wrld.horr + fac*re->wrld.zenr;
1334                         skycol[1]= (1.0f-fac)*re->wrld.horg + fac*re->wrld.zeng;
1335                         skycol[2]= (1.0f-fac)*re->wrld.horb + fac*re->wrld.zenb;
1336                 }
1337 #if 0
1338                 else {  /* WO_AOSKYTEX */
1339                         float dxyview[3];
1340                         bn[0]= -bn[0];
1341                         bn[1]= -bn[1];
1342                         bn[2]= -bn[2];
1343                         dxyview[0]= 1.0f;
1344                         dxyview[1]= 1.0f;
1345                         dxyview[2]= 0.0f;
1346                         shadeSkyView(skycol, co, bn, dxyview);
1347                 }
1348 #endif
1349
1350                 VecMulf(skycol, occlusion);
1351         }
1352         else {
1353                 skycol[0]= occlusion;
1354                 skycol[1]= occlusion;
1355                 skycol[2]= occlusion;
1356         }
1357 }
1358
1359 /* ---------------------------- Caching ------------------------------- */
1360
1361 static OcclusionCacheSample *find_occ_sample(OcclusionCache *cache, int x, int y)
1362 {
1363         x -= cache->x;
1364         y -= cache->y;
1365
1366         x /= cache->step;
1367         y /= cache->step;
1368         x *= cache->step;
1369         y *= cache->step;
1370
1371         if(x < 0 || x >= cache->w || y < 0 || y >= cache->h)
1372                 return NULL;
1373         else
1374                 return &cache->sample[y*cache->w + x];
1375 }
1376
1377 static int sample_occ_cache(OcclusionTree *tree, float *co, float *n, int x, int y, int thread, float *col)
1378 {
1379         OcclusionCache *cache;
1380         OcclusionCacheSample *samples[4], *sample;
1381         float wn[4], wz[4], wb[4], tx, ty, w, totw, mino, maxo;
1382         float d[3], dist2;
1383         int i, x1, y1, x2, y2;
1384
1385         if(!tree->cache)
1386                 return 0;
1387         
1388         /* first try to find a sample in the same pixel */
1389         cache= &tree->cache[thread];
1390
1391         if(cache->sample && cache->step) {
1392                 sample= &cache->sample[(y-cache->y)*cache->w + (x-cache->x)];
1393                 if(sample->filled) {
1394                         VECSUB(d, sample->co, co);
1395                         dist2= INPR(d, d);
1396                         if(dist2 < 0.5f*sample->dist2 && INPR(sample->n, n) > 0.98f) {
1397                                 VECCOPY(col, sample->col);
1398                                 return 1;
1399                         }
1400                 }
1401         }
1402         else
1403                 return 0;
1404
1405         /* try to interpolate between 4 neighbouring pixels */
1406         samples[0]= find_occ_sample(cache, x, y);
1407         samples[1]= find_occ_sample(cache, x+cache->step, y);
1408         samples[2]= find_occ_sample(cache, x, y+cache->step);
1409         samples[3]= find_occ_sample(cache, x+cache->step, y+cache->step);
1410
1411         for(i=0; i<4; i++)
1412                 if(!samples[i] || !samples[i]->filled)
1413                         return 0;
1414
1415         /* require intensities not being too different */
1416         mino= MIN4(samples[0]->intensity, samples[1]->intensity, samples[2]->intensity, samples[3]->intensity);
1417         maxo= MAX4(samples[0]->intensity, samples[1]->intensity, samples[2]->intensity, samples[3]->intensity);
1418
1419         if(maxo - mino > 0.05f)
1420                 return 0;
1421
1422         /* compute weighted interpolation between samples */
1423         col[0]= col[1]= col[2]= 0.0f;
1424         totw= 0.0f;
1425
1426         x1= samples[0]->x;
1427         y1= samples[0]->y;
1428         x2= samples[3]->x;
1429         y2= samples[3]->y;
1430
1431         tx= (float)(x2 - x)/(float)(x2 - x1);
1432         ty= (float)(y2 - y)/(float)(y2 - y1);
1433
1434         wb[3]= (1.0f-tx)*(1.0f-ty);
1435         wb[2]= (tx)*(1.0f-ty);
1436         wb[1]= (1.0f-tx)*(ty);
1437         wb[0]= tx*ty;
1438
1439         for(i=0; i<4; i++) {
1440                 VECSUB(d, samples[i]->co, co);
1441                 dist2= INPR(d, d);
1442
1443                 wz[i]= 1.0f; //(samples[i]->dist2/(1e-4f + dist2));
1444                 wn[i]= pow(INPR(samples[i]->n, n), 32.0f);
1445
1446                 w= wb[i]*wn[i]*wz[i];
1447
1448                 totw += w;
1449                 col[0] += w*samples[i]->col[0];
1450                 col[1] += w*samples[i]->col[1];
1451                 col[2] += w*samples[i]->col[2];
1452         }
1453
1454         if(totw >= 0.9f) {
1455                 totw= 1.0f/totw;
1456                 col[0] *= totw;
1457                 col[1] *= totw;
1458                 col[2] *= totw;
1459                 return 1;
1460         }
1461
1462         return 0;
1463 }
1464
1465 static void sample_occ_surface(ShadeInput *shi)
1466 {
1467         StrandRen *strand= shi->strand;
1468         StrandSurface *mesh= strand->buffer->surface;
1469         int *face, *index = RE_strandren_get_face(shi->obr, strand, 0);
1470         float w[4], *co1, *co2, *co3, *co4;
1471
1472         if(mesh && mesh->face && mesh->co && mesh->col && index) {
1473                 face= mesh->face[*index];
1474
1475                 co1= mesh->co[face[0]];
1476                 co2= mesh->co[face[1]];
1477                 co3= mesh->co[face[2]];
1478                 co4= (face[3])? mesh->co[face[3]]: NULL;
1479
1480                 InterpWeightsQ3Dfl(co1, co2, co3, co4, strand->vert->co, w);
1481
1482                 shi->ao[0]= shi->ao[1]= shi->ao[2]= 0.0f;
1483                 VECADDFAC(shi->ao, shi->ao, mesh->col[face[0]], w[0]);
1484                 VECADDFAC(shi->ao, shi->ao, mesh->col[face[1]], w[1]);
1485                 VECADDFAC(shi->ao, shi->ao, mesh->col[face[2]], w[2]);
1486                 if(face[3])
1487                         VECADDFAC(shi->ao, shi->ao, mesh->col[face[3]], w[3]);
1488         }
1489         else {
1490                 shi->ao[0]= 1.0f;
1491                 shi->ao[1]= 1.0f;
1492                 shi->ao[2]= 1.0f;
1493         }
1494 }
1495
1496 /* ------------------------- External Functions --------------------------- */
1497
1498 static void *exec_strandsurface_sample(void *data)
1499 {
1500         OcclusionThread *othread= (OcclusionThread*)data;
1501         Render *re= othread->re;
1502         StrandSurface *mesh= othread->mesh;
1503         float col[3], co[3], n[3], *co1, *co2, *co3, *co4;
1504         int a, *face;
1505
1506         for(a=othread->begin; a<othread->end; a++) {
1507                 face= mesh->face[a];
1508                 co1= mesh->co[face[0]];
1509                 co2= mesh->co[face[1]];
1510                 co3= mesh->co[face[2]];
1511
1512                 if(face[3]) {
1513                         co4= mesh->co[face[3]];
1514
1515                         VecLerpf(co, co1, co3, 0.5f);
1516                         CalcNormFloat4(co1, co2, co3, co4, n);
1517                 }
1518                 else {
1519                         CalcCent3f(co, co1, co2, co3);
1520                         CalcNormFloat(co1, co2, co3, n);
1521                 }
1522                 VecNegf(n);
1523
1524                 sample_occ_tree(re, re->occlusiontree, NULL, co, n, othread->thread, 0, col);
1525                 VECCOPY(othread->facecol[a], col);
1526         }
1527
1528         return 0;
1529 }
1530
1531 void make_occ_tree(Render *re)
1532 {
1533         OcclusionThread othreads[BLENDER_MAX_THREADS];
1534         StrandSurface *mesh;
1535         ListBase threads;
1536         float col[3], (*facecol)[3];
1537         int a, totface, totthread, *face, *count;
1538
1539         /* ugly, needed for occ_face */
1540         R= *re;
1541
1542         re->i.infostr= "Occlusion preprocessing";
1543         re->stats_draw(re->sdh, &re->i);
1544         
1545         re->occlusiontree= occ_tree_build(re);
1546         
1547         if(re->occlusiontree) {
1548                 if(re->wrld.ao_approx_passes)
1549                         occ_compute_passes(re, re->occlusiontree, re->wrld.ao_approx_passes);
1550
1551                 for(mesh=re->strandsurface.first; mesh; mesh=mesh->next) {
1552                         if(!mesh->face || !mesh->co || !mesh->col)
1553                                 continue;
1554
1555                         count= MEM_callocN(sizeof(int)*mesh->totvert, "OcclusionCount");
1556                         facecol= MEM_callocN(sizeof(float)*3*mesh->totface, "StrandSurfFaceCol");
1557
1558                         totthread= (mesh->totface > 10000)? re->r.threads: 1;
1559                         totface= mesh->totface/totthread;
1560                         for(a=0; a<totthread; a++) {
1561                                 othreads[a].re= re;
1562                                 othreads[a].facecol= facecol;
1563                                 othreads[a].thread= a;
1564                                 othreads[a].mesh= mesh;
1565                                 othreads[a].begin= a*totface;
1566                                 othreads[a].end= (a == totthread-1)? mesh->totface: (a+1)*totface;
1567                         }
1568
1569                         if(totthread == 1) {
1570                                 exec_strandsurface_sample(&othreads[0]);
1571                         }
1572                         else {
1573                                 BLI_init_threads(&threads, exec_strandsurface_sample, totthread);
1574
1575                                 for(a=0; a<totthread; a++)
1576                                         BLI_insert_thread(&threads, &othreads[a]);
1577
1578                                 BLI_end_threads(&threads);
1579                         }
1580
1581                         for(a=0; a<mesh->totface; a++) {
1582                                 face= mesh->face[a];
1583
1584                                 VECCOPY(col, facecol[a]);
1585                                 VECADD(mesh->col[face[0]], mesh->col[face[0]], col);
1586                                 count[face[0]]++;
1587                                 VECADD(mesh->col[face[1]], mesh->col[face[1]], col);
1588                                 count[face[1]]++;
1589                                 VECADD(mesh->col[face[2]], mesh->col[face[2]], col);
1590                                 count[face[2]]++;
1591
1592                                 if(face[3]) {
1593                                         VECADD(mesh->col[face[3]], mesh->col[face[3]], col);
1594                                         count[face[3]]++;
1595                                 }
1596                         }
1597
1598                         for(a=0; a<mesh->totvert; a++)
1599                                 if(count[a])
1600                                         VecMulf(mesh->col[a], 1.0f/count[a]);
1601
1602                         MEM_freeN(count);
1603                         MEM_freeN(facecol);
1604                 }
1605         }
1606 }
1607
1608 void free_occ(Render *re)
1609 {
1610         if(re->occlusiontree) {
1611                 occ_free_tree(re->occlusiontree);
1612                 re->occlusiontree = NULL;
1613         }
1614 }
1615
1616 void sample_occ(Render *re, ShadeInput *shi)
1617 {
1618         OcclusionTree *tree= re->occlusiontree;
1619         OcclusionCache *cache;
1620         OcclusionCacheSample *sample;
1621         OccFace exclude;
1622         int onlyshadow;
1623
1624         if(tree) {
1625                 if(shi->strand) {
1626                         sample_occ_surface(shi);
1627                 }
1628                 /* try to get result from the cache if possible */
1629                 else if(shi->depth!=0 || !sample_occ_cache(tree, shi->co, shi->vno, shi->xs, shi->ys, shi->thread, shi->ao)) {
1630                         /* no luck, let's sample the occlusion */
1631                         exclude.obi= shi->obi - re->objectinstance;
1632                         exclude.facenr= shi->vlr->index;
1633                         onlyshadow= (shi->mat->mode & MA_ONLYSHADOW);
1634                         sample_occ_tree(re, tree, &exclude, shi->co, shi->vno, shi->thread, onlyshadow, shi->ao);
1635
1636                         /* fill result into sample, each time */
1637                         if(tree->cache) {
1638                                 cache= &tree->cache[shi->thread];
1639
1640                                 if(cache->sample && cache->step) {
1641                                         sample= &cache->sample[(shi->ys-cache->y)*cache->w + (shi->xs-cache->x)];
1642                                         VECCOPY(sample->co, shi->co);
1643                                         VECCOPY(sample->n, shi->vno);
1644                                         VECCOPY(sample->col, shi->ao);
1645                                         sample->intensity= MAX3(sample->col[0], sample->col[1], sample->col[2]);
1646                                         sample->dist2= INPR(shi->dxco, shi->dxco) + INPR(shi->dyco, shi->dyco);
1647                                         sample->filled= 1;
1648                                 }
1649                         }
1650                 }
1651         }
1652         else {
1653                 shi->ao[0]= 1.0f;
1654                 shi->ao[1]= 1.0f;
1655                 shi->ao[2]= 1.0f;
1656         }
1657 }
1658
1659 void cache_occ_samples(Render *re, RenderPart *pa, ShadeSample *ssamp)
1660 {
1661         OcclusionTree *tree= re->occlusiontree;
1662         PixStr ps;
1663         OcclusionCache *cache;
1664         OcclusionCacheSample *sample;
1665         OccFace exclude;
1666         ShadeInput *shi;
1667         intptr_t *rd=NULL;
1668         int *ro=NULL, *rp=NULL, *rz=NULL, onlyshadow;
1669         int x, y, step = CACHE_STEP;
1670
1671         if(!tree->cache)
1672                 return;
1673
1674         cache= &tree->cache[pa->thread];
1675         cache->w= pa->rectx;
1676         cache->h= pa->recty;
1677         cache->x= pa->disprect.xmin;
1678         cache->y= pa->disprect.ymin;
1679         cache->step= step;
1680         cache->sample= MEM_callocN(sizeof(OcclusionCacheSample)*cache->w*cache->h, "OcclusionCacheSample");
1681         sample= cache->sample;
1682
1683         if(re->osa) {
1684                 rd= pa->rectdaps;
1685         }
1686         else {
1687                 /* fake pixel struct for non-osa */
1688                 ps.next= NULL;
1689                 ps.mask= 0xFFFF;
1690
1691                 ro= pa->recto;
1692                 rp= pa->rectp;
1693                 rz= pa->rectz;
1694         }
1695
1696         /* compute a sample at every step pixels */
1697         for(y=pa->disprect.ymin; y<pa->disprect.ymax; y++) {
1698                 for(x=pa->disprect.xmin; x<pa->disprect.xmax; x++, sample++, rd++, ro++, rp++, rz++) {
1699                         if(!(((x - pa->disprect.xmin + step) % step) == 0 || x == pa->disprect.xmax-1))
1700                                 continue;
1701                         if(!(((y - pa->disprect.ymin + step) % step) == 0 || y == pa->disprect.ymax-1))
1702                                 continue;
1703
1704                         if(re->osa) {
1705                                 if(!*rd) continue;
1706
1707                                 shade_samples_fill_with_ps(ssamp, (PixStr *)(*rd), x, y);
1708                         }
1709                         else {
1710                                 if(!*rp) continue;
1711
1712                                 ps.obi= *ro;
1713                                 ps.facenr= *rp;
1714                                 ps.z= *rz;
1715                                 shade_samples_fill_with_ps(ssamp, &ps, x, y);
1716                         }
1717
1718                         shi= ssamp->shi;
1719                         if(shi->vlr) {
1720                                 onlyshadow= (shi->mat->mode & MA_ONLYSHADOW);
1721                                 exclude.obi= shi->obi - re->objectinstance;
1722                                 exclude.facenr= shi->vlr->index;
1723                                 sample_occ_tree(re, tree, &exclude, shi->co, shi->vno, shi->thread, onlyshadow, shi->ao);
1724
1725                                 VECCOPY(sample->co, shi->co);
1726                                 VECCOPY(sample->n, shi->vno);
1727                                 VECCOPY(sample->col, shi->ao);
1728                                 sample->intensity= MAX3(sample->col[0], sample->col[1], sample->col[2]);
1729                                 sample->dist2= INPR(shi->dxco, shi->dxco) + INPR(shi->dyco, shi->dyco);
1730                                 sample->x= shi->xs;
1731                                 sample->y= shi->ys;
1732                                 sample->filled= 1;
1733                         }
1734
1735                         if(re->test_break(re->tbh))
1736                                 break;
1737                 }
1738         }
1739 }
1740
1741 void free_occ_samples(Render *re, RenderPart *pa)
1742 {
1743         OcclusionTree *tree= re->occlusiontree;
1744         OcclusionCache *cache;
1745
1746         if(tree->cache) {
1747                 cache= &tree->cache[pa->thread];
1748
1749                 if(cache->sample)
1750                         MEM_freeN(cache->sample);
1751
1752                 cache->w= 0;
1753                 cache->h= 0;
1754                 cache->step= 0;
1755         }
1756 }
1757