- Bugfix: scenes with SSS materials slowed down preview render
[blender.git] / source / blender / render / intern / source / occlusion.c
1 /* 
2  * $Id$
3  *
4  * ***** BEGIN GPL LICENSE BLOCK *****
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software Foundation,
18  * Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
19  *
20  * The Original Code is Copyright (C) 2008 Blender Foundation.
21  * All rights reserved.
22  *
23  * The Original Code is: all of this file.
24  *
25  * Contributor(s): Brecht Van Lommel.
26  *
27  * ***** END GPL LICENSE BLOCK *****
28  */
29
30 #include <math.h>
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34
35 #include "MEM_guardedalloc.h"
36
37 #include "DNA_material_types.h"
38
39 #include "BLI_arithb.h"
40 #include "BLI_blenlib.h"
41 #include "BLI_memarena.h"
42 #include "BLI_threads.h"
43
44 #include "BKE_global.h"
45 #include "BKE_utildefines.h"
46
47 #include "RE_shader_ext.h"
48
49 /* local includes */
50 #include "occlusion.h"
51 #include "render_types.h"
52 #include "rendercore.h"
53 #include "renderdatabase.h"
54 #include "pixelshading.h"
55 #include "shading.h"
56 #include "zbuf.h"
57
58 /* ------------------------- Declarations --------------------------- */
59
60 #define INVALID_INDEX ((int)(~0))
61 #define INVPI 0.31830988618379069f
62 #define TOTCHILD 8
63 #define CACHE_STEP 3
64
65 typedef struct OcclusionCacheSample {
66         float co[3], n[3], col[3], intensity, dist2;
67         int x, y, filled;
68 } OcclusionCacheSample;
69
70 typedef struct OcclusionCache {
71         OcclusionCacheSample *sample;
72         int x, y, w, h, step;
73 } OcclusionCache;
74
75 typedef struct OccFace {
76         int obi;
77         int facenr;
78 } OccFace;
79
80 typedef struct OccNode {
81         float co[3], area;
82         float sh[9], dco;
83         float occlusion;
84         int childflag;
85         union {
86                 //OccFace face;
87                 int face;
88                 struct OccNode *node;
89         } child[TOTCHILD];
90 } OccNode;
91
92 typedef struct OcclusionTree {
93         MemArena *arena;
94
95         float (*co)[3];         /* temporary during build */
96
97         OccFace *face;          /* instance and face indices */
98         float *occlusion;       /* occlusion for faces */
99         
100         OccNode *root;
101
102         OccNode **stack[BLENDER_MAX_THREADS];
103         int maxdepth;
104
105         int totface;
106
107         float error;
108         float distfac;
109
110         int dothreadedbuild;
111         int totbuildthread;
112
113         OcclusionCache *cache;
114 } OcclusionTree;
115
116 typedef struct OcclusionThread {
117         Render *re;
118         StrandSurface *mesh;
119         float (*facecol)[3];
120         int begin, end;
121         int thread;
122 } OcclusionThread;
123
124 typedef struct OcclusionBuildThread {
125         OcclusionTree *tree;
126         int begin, end, depth;
127         OccNode *node;
128 } OcclusionBuildThread;
129
130 /* ------------------------- Shading --------------------------- */
131
132 extern Render R; // meh
133
134 #if 0
135 static void occ_shade(ShadeSample *ssamp, ObjectInstanceRen *obi, VlakRen *vlr, float *rad)
136 {
137         ShadeInput *shi= ssamp->shi;
138         ShadeResult *shr= ssamp->shr;
139         float l, u, v, *v1, *v2, *v3;
140         
141         /* init */
142         if(vlr->v4) {
143                 shi->u= u= 0.5f;
144                 shi->v= v= 0.5f;
145         }
146         else {
147                 shi->u= u= 1.0f/3.0f;
148                 shi->v= v= 1.0f/3.0f;
149         }
150
151         /* setup render coordinates */
152         v1= vlr->v1->co;
153         v2= vlr->v2->co;
154         v3= vlr->v3->co;
155         
156         /* renderco */
157         l= 1.0f-u-v;
158         
159         shi->co[0]= l*v3[0]+u*v1[0]+v*v2[0];
160         shi->co[1]= l*v3[1]+u*v1[1]+v*v2[1];
161         shi->co[2]= l*v3[2]+u*v1[2]+v*v2[2];
162         
163         shade_input_set_triangle_i(shi, obi, vlr, 0, 1, 2);
164
165         /* set up view vector */
166         VECCOPY(shi->view, shi->co);
167         Normalize(shi->view);
168         
169         /* cache for shadow */
170         shi->samplenr++;
171         
172         shi->xs= 0; // TODO
173         shi->ys= 0;
174         
175         shade_input_set_normals(shi);
176
177         /* no normal flip */
178         if(shi->flippednor)
179                 shade_input_flip_normals(shi);
180
181         /* not a pretty solution, but fixes common cases */
182         if(shi->obr->ob && shi->obr->ob->transflag & OB_NEG_SCALE) {
183                 VecMulf(shi->vn, -1.0f);
184                 VecMulf(shi->vno, -1.0f);
185         }
186
187         /* init material vars */
188         // note, keep this synced with render_types.h
189         memcpy(&shi->r, &shi->mat->r, 23*sizeof(float));
190         shi->har= shi->mat->har;
191         
192         /* render */
193         shade_input_set_shade_texco(shi);
194         shade_material_loop(shi, shr); /* todo: nodes */
195         
196         VECCOPY(rad, shr->combined);
197 }
198
199 static void occ_build_shade(Render *re, OcclusionTree *tree)
200 {
201         ShadeSample ssamp;
202         ObjectInstanceRen *obi;
203         VlakRen *vlr;
204         int a;
205
206         R= *re;
207
208         /* setup shade sample with correct passes */
209         memset(&ssamp, 0, sizeof(ShadeSample));
210         ssamp.shi[0].lay= re->scene->lay;
211         ssamp.shi[0].passflag= SCE_PASS_DIFFUSE|SCE_PASS_RGBA;
212         ssamp.shi[0].combinedflag= ~(SCE_PASS_SPEC);
213         ssamp.tot= 1;
214
215         for(a=0; a<tree->totface; a++) {
216                 obi= &R.objectinstance[tree->face[a].obi];
217                 vlr= RE_findOrAddVlak(obi->obr, tree->face[a].vlr);
218
219                 occ_shade(&ssamp, obi, vlr, tree->rad[a]);
220         }
221 }
222 #endif
223
224 /* ------------------------- Spherical Harmonics --------------------------- */
225
226 /* Use 2nd order SH => 9 coefficients, stored in this order:
227    0 = (0,0),
228    1 = (1,-1), 2 = (1,0), 3 = (1,1),
229    4 = (2,-2), 5 = (2,-1), 6 = (2,0), 7 = (2,1), 8 = (2,2) */
230
231 static void sh_copy(float *shresult, float *sh)
232 {
233         memcpy(shresult, sh, sizeof(float)*9);
234 }
235
236 static void sh_mul(float *sh, float f)
237 {
238         int i;
239
240         for(i=0; i<9; i++)
241                 sh[i] *= f;
242 }
243
244 static void sh_add(float *shresult, float *sh1, float *sh2)
245 {
246         int i;
247
248         for(i=0; i<9; i++)
249                 shresult[i]= sh1[i] + sh2[i];
250 }
251
252 static void sh_from_disc(float *n, float area, float *shresult)
253 {
254         /* See formula (3) in:
255            "An Efficient Representation for Irradiance Environment Maps" */
256         float sh[9], x, y, z;
257
258         x= n[0];
259         y= n[1];
260         z= n[2];
261
262         sh[0]= 0.282095f;
263
264         sh[1]= 0.488603f*y;
265         sh[2]= 0.488603f*z;
266         sh[3]= 0.488603f*x;
267         
268         sh[4]= 1.092548f*x*y;
269         sh[5]= 1.092548f*y*z;
270         sh[6]= 0.315392f*(3.0f*z*z - 1.0f);
271         sh[7]= 1.092548f*x*z;
272         sh[8]= 0.546274f*(x*x - y*y);
273
274         sh_mul(sh, area);
275         sh_copy(shresult, sh);
276 }
277
278 static float sh_eval(float *sh, float *v)
279 {
280         /* See formula (13) in:
281            "An Efficient Representation for Irradiance Environment Maps" */
282         static const float c1 = 0.429043f, c2 = 0.511664f, c3 = 0.743125f;
283         static const float c4 = 0.886227f, c5 = 0.247708f;
284         float x, y, z, sum;
285
286         x= v[0];
287         y= v[1];
288         z= v[2];
289
290         sum= c1*sh[8]*(x*x - y*y);
291         sum += c3*sh[6]*z*z;
292         sum += c4*sh[0];
293         sum += -c5*sh[6];
294         sum += 2.0f*c1*(sh[4]*x*y + sh[7]*x*z + sh[5]*y*z);
295         sum += 2.0f*c2*(sh[3]*x + sh[1]*y + sh[2]*z);
296
297         return sum;
298 }
299
300 /* ------------------------------ Building --------------------------------- */
301
302 static void occ_face(const OccFace *face, float *co, float *normal, float *area)
303 {
304         ObjectInstanceRen *obi;
305         VlakRen *vlr;
306         float v1[3], v2[3], v3[3], v4[3];
307
308         obi= &R.objectinstance[face->obi];
309         vlr= RE_findOrAddVlak(obi->obr, face->facenr);
310         
311         if(co) {
312                 if(vlr->v4)
313                         VecLerpf(co, vlr->v1->co, vlr->v3->co, 0.5f);
314                 else
315                         CalcCent3f(co, vlr->v1->co, vlr->v2->co, vlr->v3->co);
316
317                 if(obi->flag & R_TRANSFORMED)
318                         Mat4MulVecfl(obi->mat, co);
319         }
320         
321         if(normal) {
322                 normal[0]= -vlr->n[0];
323                 normal[1]= -vlr->n[1];
324                 normal[2]= -vlr->n[2];
325
326                 if(obi->flag & R_TRANSFORMED) {
327                         float xn, yn, zn, (*imat)[3]= obi->imat;
328
329                         xn= normal[0];
330                         yn= normal[1];
331                         zn= normal[2];
332
333                         normal[0]= imat[0][0]*xn + imat[0][1]*yn + imat[0][2]*zn;
334                         normal[1]= imat[1][0]*xn + imat[1][1]*yn + imat[1][2]*zn;
335                         normal[2]= imat[2][0]*xn + imat[2][1]*yn + imat[2][2]*zn;
336                 }
337         }
338
339         if(area) {
340                 VECCOPY(v1, vlr->v1->co);
341                 VECCOPY(v2, vlr->v2->co);
342                 VECCOPY(v3, vlr->v3->co);
343                 if(vlr->v4) VECCOPY(v4, vlr->v4->co);
344
345                 if(obi->flag & R_TRANSFORMED) {
346                         Mat4MulVecfl(obi->mat, v1);
347                         Mat4MulVecfl(obi->mat, v2);
348                         Mat4MulVecfl(obi->mat, v3);
349                         if(vlr->v4) Mat4MulVecfl(obi->mat, v4);
350                 }
351
352                 /* todo: correct area for instances */
353                 if(vlr->v4)
354                         *area= AreaQ3Dfl(v1, v2, v3, v4);
355                 else
356                         *area= AreaT3Dfl(v1, v2, v3);
357         }
358 }
359
360 static void occ_sum_occlusion(OcclusionTree *tree, OccNode *node)
361 {
362         OccNode *child;
363         float occ, area, totarea;
364         int a, b;
365
366         occ= 0.0f;
367         totarea= 0.0f;
368
369         for(b=0; b<TOTCHILD; b++) {
370                 if(node->childflag & (1<<b)) {
371                         a= node->child[b].face;
372                         occ_face(&tree->face[a], 0, 0, &area);
373                         occ += area*tree->occlusion[a];
374                         totarea += area;
375                 }
376                 else if(node->child[b].node) {
377                         child= node->child[b].node;
378                         occ_sum_occlusion(tree, child);
379
380                         occ += child->area*child->occlusion;
381                         totarea += child->area;
382                 }
383         }
384
385         if(totarea != 0.0f)
386                 occ /= totarea;
387         
388         node->occlusion= occ;
389 }
390
391 static int occ_find_bbox_axis(OcclusionTree *tree, int begin, int end, float *min, float *max)
392 {
393         float len, maxlen= -1.0f;
394         int a, axis = 0;
395
396         INIT_MINMAX(min, max);
397
398         for(a=begin; a<end; a++)
399                 DO_MINMAX(tree->co[a], min, max)
400
401         for(a=0; a<3; a++) {
402                 len= max[a] - min[a];
403
404                 if(len > maxlen) {
405                         maxlen= len;
406                         axis= a;
407                 }
408         }
409
410         return axis;
411 }
412
413 void occ_node_from_face(OccFace *face, OccNode *node)
414 {
415         float n[3];
416
417         occ_face(face, node->co, n, &node->area);
418         node->dco= 0.0f;
419         sh_from_disc(n, node->area, node->sh);
420 }
421
422 static void occ_build_dco(OcclusionTree *tree, OccNode *node, float *co, float *dco)
423 {
424         OccNode *child;
425         float dist, d[3], nco[3];
426         int b;
427
428         for(b=0; b<TOTCHILD; b++) {
429                 if(node->childflag & (1<<b)) {
430                         occ_face(tree->face+node->child[b].face, nco, 0, 0);
431                 }
432                 else if(node->child[b].node) {
433                         child= node->child[b].node;
434                         occ_build_dco(tree, child, co, dco);
435                         VECCOPY(nco, child->co);
436                 }
437
438                 VECSUB(d, nco, co);
439                 dist= INPR(d, d);
440                 if(dist > *dco)
441                         *dco= dist;
442         }
443 }
444
445 static void occ_build_split(OcclusionTree *tree, int begin, int end, int *split)
446 {
447         float min[3], max[3], mid;
448         int axis, a, enda;
449
450         /* split in middle of boundbox. this seems faster than median split
451          * on complex scenes, possibly since it avoids two distant faces to
452          * be in the same node better? */
453         axis= occ_find_bbox_axis(tree, begin, end, min, max);
454         mid= 0.5f*(min[axis]+max[axis]);
455
456         a= begin;
457         enda= end;
458         while(a<enda) {
459                 if(tree->co[a][axis] > mid) {
460                         enda--;
461                         SWAP(OccFace, tree->face[a], tree->face[enda]);
462                         SWAP(float, tree->co[a][0], tree->co[enda][0]);
463                         SWAP(float, tree->co[a][1], tree->co[enda][1]);
464                         SWAP(float, tree->co[a][2], tree->co[enda][2]);
465                 }
466                 else
467                         a++;
468         }
469
470         *split= enda;
471 }
472
473 static void occ_build_8_split(OcclusionTree *tree, int begin, int end, int *offset, int *count)
474 {
475         /* split faces into eight groups */
476         int b, splitx, splity[2], splitz[4];
477
478         occ_build_split(tree, begin, end, &splitx);
479
480         /* force split if none found, to deal with degenerate geometry */
481         if(splitx == begin || splitx == end)
482                 splitx= (begin+end)/2;
483
484         occ_build_split(tree, begin, splitx, &splity[0]);
485         occ_build_split(tree, splitx, end, &splity[1]);
486
487         occ_build_split(tree, begin, splity[0], &splitz[0]);
488         occ_build_split(tree, splity[0], splitx, &splitz[1]);
489         occ_build_split(tree, splitx, splity[1], &splitz[2]);
490         occ_build_split(tree, splity[1], end, &splitz[3]);
491
492         offset[0]= begin;
493         offset[1]= splitz[0];
494         offset[2]= splity[0];
495         offset[3]= splitz[1];
496         offset[4]= splitx;
497         offset[5]= splitz[2];
498         offset[6]= splity[1];
499         offset[7]= splitz[3];
500
501         for(b=0; b<7; b++)
502                 count[b]= offset[b+1] - offset[b];
503         count[7]= end - offset[7];
504 }
505
506 static void occ_build_recursive(OcclusionTree *tree, OccNode *node, int begin, int end, int depth);
507
508 static void *exec_occ_build(void *data)
509 {
510         OcclusionBuildThread *othread= (OcclusionBuildThread*)data;
511
512         occ_build_recursive(othread->tree, othread->node, othread->begin, othread->end, othread->depth);
513
514         return 0;
515 }
516
517 static void occ_build_recursive(OcclusionTree *tree, OccNode *node, int begin, int end, int depth)
518 {
519         ListBase threads;
520         OcclusionBuildThread othreads[BLENDER_MAX_THREADS];
521         OccNode *child, tmpnode;
522         OccFace *face;
523         int a, b, totthread=0, offset[TOTCHILD], count[TOTCHILD];
524
525         /* keep track of maximum depth for stack */
526         if(depth > tree->maxdepth)
527                 tree->maxdepth= depth;
528
529         /* add a new node */
530         node->occlusion= 1.0f;
531
532         /* leaf node with only children */
533         if(end - begin <= TOTCHILD) {
534                 for(a=begin, b=0; a<end; a++, b++) {
535                         face= &tree->face[a];
536                         node->child[b].face= a;
537                         node->childflag |= (1<<b);
538                 }
539         }
540         else {
541                 /* order faces */
542                 occ_build_8_split(tree, begin, end, offset, count);
543
544                 if(depth == 1 && tree->dothreadedbuild)
545                         BLI_init_threads(&threads, exec_occ_build, tree->totbuildthread);
546
547                 for(b=0; b<TOTCHILD; b++) {
548                         if(count[b] == 0) {
549                                 node->child[b].node= NULL;
550                         }
551                         else if(count[b] == 1) {
552                                 face= &tree->face[offset[b]];
553                                 node->child[b].face= offset[b];
554                                 node->childflag |= (1<<b);
555                         }
556                         else {
557                                 if(tree->dothreadedbuild)
558                                         BLI_lock_thread(LOCK_CUSTOM1);
559
560                                 child= BLI_memarena_alloc(tree->arena, sizeof(OccNode));
561                                 node->child[b].node= child;
562
563                                 if(tree->dothreadedbuild)
564                                         BLI_unlock_thread(LOCK_CUSTOM1);
565
566                                 if(depth == 1 && tree->dothreadedbuild) {
567                                         othreads[totthread].tree= tree;
568                                         othreads[totthread].node= child;
569                                         othreads[totthread].begin= offset[b];
570                                         othreads[totthread].end= offset[b]+count[b];
571                                         othreads[totthread].depth= depth+1;
572                                         BLI_insert_thread(&threads, &othreads[totthread]);
573                                         totthread++;
574                                 }
575                                 else
576                                         occ_build_recursive(tree, child, offset[b], offset[b]+count[b], depth+1);
577                         }
578                 }
579
580                 if(depth == 1 && tree->dothreadedbuild)
581                         BLI_end_threads(&threads);
582         }
583
584         /* combine area, position and sh */
585         for(b=0; b<TOTCHILD; b++) {
586                 if(node->childflag & (1<<b)) {
587                         child= &tmpnode;
588                         occ_node_from_face(tree->face+node->child[b].face, &tmpnode);
589                 }
590                 else {
591                         child= node->child[b].node;
592                 }
593
594                 if(child) {
595                         node->area += child->area;
596                         sh_add(node->sh, node->sh, child->sh);
597                         VECADDFAC(node->co, node->co, child->co, child->area);
598                 }
599         }
600
601         if(node->area != 0.0f)
602                 VecMulf(node->co, 1.0f/node->area);
603
604         /* compute maximum distance from center */
605         node->dco= 0.0f;
606         occ_build_dco(tree, node, node->co, &node->dco);
607 }
608
609 static void occ_build_sh_normalize(OccNode *node)
610 {
611         /* normalize spherical harmonics to not include area, so
612          * we can clamp the dot product and then mutliply by area */
613         int b;
614
615         if(node->area != 0.0f)
616                 sh_mul(node->sh, 1.0f/node->area);
617
618         for(b=0; b<TOTCHILD; b++) {
619                 if(node->childflag & (1<<b));
620                 else if(node->child[b].node)
621                         occ_build_sh_normalize(node->child[b].node);
622         }
623 }
624
625 static OcclusionTree *occ_tree_build(Render *re)
626 {
627         OcclusionTree *tree;
628         ObjectInstanceRen *obi;
629         ObjectRen *obr;
630         VlakRen *vlr= NULL;
631         int a, b, c, totface;
632
633         /* count */
634         totface= 0;
635         for(obi=re->instancetable.first; obi; obi=obi->next) {
636                 obr= obi->obr;
637                 for(a=0; a<obr->totvlak; a++) {
638                         if((a & 255)==0) vlr= obr->vlaknodes[a>>8].vlak;
639                         else vlr++;
640
641                         if(vlr->mat->mode & MA_TRACEBLE)
642                                 totface++;
643                 }
644         }
645
646         if(totface == 0)
647                 return NULL;
648         
649         tree= MEM_callocN(sizeof(OcclusionTree), "OcclusionTree");
650         tree->totface= totface;
651
652         /* parameters */
653         tree->error= re->wrld.ao_approx_error;
654         tree->distfac= (re->wrld.aomode & WO_AODIST)? re->wrld.aodistfac: 0.0f;
655
656         /* allocation */
657         tree->arena= BLI_memarena_new(0x8000 * sizeof(OccNode));
658         BLI_memarena_use_calloc(tree->arena);
659
660         if(re->wrld.aomode & WO_AOCACHE)
661                 tree->cache= MEM_callocN(sizeof(OcclusionCache)*BLENDER_MAX_THREADS, "OcclusionCache");
662
663         tree->face= MEM_callocN(sizeof(OccFace)*totface, "OcclusionFace");
664         tree->co= MEM_callocN(sizeof(float)*3*totface, "OcclusionCo");
665         tree->occlusion= MEM_callocN(sizeof(float)*totface, "OcclusionOcclusion");
666
667         /* make array of face pointers */
668         for(b=0, c=0, obi=re->instancetable.first; obi; obi=obi->next, c++) {
669                 obr= obi->obr;
670                 for(a=0; a<obr->totvlak; a++) {
671                         if((a & 255)==0) vlr= obr->vlaknodes[a>>8].vlak;
672                         else vlr++;
673
674                         if(vlr->mat->mode & MA_TRACEBLE) {
675                                 tree->face[b].obi= c;
676                                 tree->face[b].facenr= a;
677                                 tree->occlusion[b]= 1.0f;
678                                 occ_face(&tree->face[b], tree->co[b], NULL, NULL); 
679                                 b++;
680                         }
681                 }
682         }
683
684         /* threads */
685         tree->totbuildthread= (re->r.threads > 1 && totface > 10000)? 8: 1;
686         tree->dothreadedbuild= (tree->totbuildthread > 1);
687
688         /* recurse */
689         tree->root= BLI_memarena_alloc(tree->arena, sizeof(OccNode));
690         occ_build_recursive(tree, tree->root, 0, totface, 1);
691
692 #if 0
693         if(tree->doindirect) {
694                 occ_build_shade(re, tree);
695                 occ_sum_occlusion(tree, tree->root);
696         }
697 #endif
698         
699         MEM_freeN(tree->co);
700         tree->co= NULL;
701
702         occ_build_sh_normalize(tree->root);
703
704         for(a=0; a<BLENDER_MAX_THREADS; a++)
705                 tree->stack[a]= MEM_callocN(sizeof(OccNode)*TOTCHILD*(tree->maxdepth+1), "OccStack");
706
707         return tree;
708 }
709
710 static void occ_free_tree(OcclusionTree *tree)
711 {
712         int a;
713
714         if(tree) {
715                 if(tree->arena) BLI_memarena_free(tree->arena);
716                 for(a=0; a<BLENDER_MAX_THREADS; a++)
717                         if(tree->stack[a])
718                                 MEM_freeN(tree->stack[a]);
719                 if(tree->occlusion) MEM_freeN(tree->occlusion);
720                 if(tree->face) MEM_freeN(tree->face);
721                 if(tree->cache) MEM_freeN(tree->cache);
722                 MEM_freeN(tree);
723         }
724 }
725
726 /* ------------------------- Traversal --------------------------- */
727
728 static float occ_solid_angle(OccNode *node, float *v, float d2, float invd2, float *receivenormal)
729 {
730         float dotreceive, dotemit;
731         float ev[3];
732
733         ev[0]= -v[0]*invd2;
734         ev[1]= -v[1]*invd2;
735         ev[2]= -v[2]*invd2;
736         dotemit= sh_eval(node->sh, ev);
737         dotreceive= INPR(receivenormal, v)*invd2;
738
739         CLAMP(dotemit, 0.0f, 1.0f);
740         CLAMP(dotreceive, 0.0f, 1.0f);
741         
742         return ((node->area*dotemit*dotreceive)/(d2 + node->area*INVPI))*INVPI;
743 }
744
745 static void VecAddDir(float *result, float *v1, float *v2, float fac)
746 {
747         result[0]= v1[0] + fac*(v2[0] - v1[0]);
748         result[1]= v1[1] + fac*(v2[1] - v1[1]);
749         result[2]= v1[2] + fac*(v2[2] - v1[2]);
750 }
751
752 static int occ_visible_quad(float *p, float *n, float *v0, float *v1, float *v2, float *q0, float *q1, float *q2, float *q3)
753 {
754         static const float epsilon = 1e-6f;
755         float c, sd[3];
756         
757         c= INPR(n, p);
758
759         /* signed distances from the vertices to the plane. */
760         sd[0]= INPR(n, v0) - c;
761         sd[1]= INPR(n, v1) - c;
762         sd[2]= INPR(n, v2) - c;
763
764         if(fabs(sd[0]) < epsilon) sd[0] = 0.0f;
765         if(fabs(sd[1]) < epsilon) sd[1] = 0.0f;
766         if(fabs(sd[2]) < epsilon) sd[2] = 0.0f;
767
768         if(sd[0] > 0) {
769                 if(sd[1] > 0) {
770                         if(sd[2] > 0) {
771                                 // +++
772                                 VECCOPY(q0, v0);
773                                 VECCOPY(q1, v1);
774                                 VECCOPY(q2, v2);
775                                 VECCOPY(q3, q2);
776                         }
777                         else if(sd[2] < 0) {
778                                 // ++-
779                                 VECCOPY(q0, v0);
780                                 VECCOPY(q1, v1);
781                                 VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
782                                 VecAddDir(q3, v0, v2, (sd[0]/(sd[0]-sd[2])));
783                         }
784                         else {
785                                 // ++0
786                                 VECCOPY(q0, v0);
787                                 VECCOPY(q1, v1);
788                                 VECCOPY(q2, v2);
789                                 VECCOPY(q3, q2);
790                         }
791                 }
792                 else if(sd[1] < 0) {
793                         if(sd[2] > 0) {
794                                 // +-+
795                                 VECCOPY(q0, v0);
796                                 VecAddDir(q1, v0, v1, (sd[0]/(sd[0]-sd[1])));
797                                 VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
798                                 VECCOPY(q3, v2);
799                         }
800                         else if(sd[2] < 0) {
801                                 // +--
802                                 VECCOPY(q0, v0);
803                                 VecAddDir(q1, v0, v1, (sd[0]/(sd[0]-sd[1])));
804                                 VecAddDir(q2, v0, v2, (sd[0]/(sd[0]-sd[2])));
805                                 VECCOPY(q3, q2);
806                         }
807                         else {
808                                 // +-0
809                                 VECCOPY(q0, v0);
810                                 VecAddDir(q1, v0, v1, (sd[0]/(sd[0]-sd[1])));
811                                 VECCOPY(q2, v2);
812                                 VECCOPY(q3, q2);
813                         }
814                 }
815                 else {
816                         if(sd[2] > 0) {
817                                 // +0+
818                                 VECCOPY(q0, v0);
819                                 VECCOPY(q1, v1);
820                                 VECCOPY(q2, v2);
821                                 VECCOPY(q3, q2);
822                         }
823                         else if(sd[2] < 0) {
824                                 // +0-
825                                 VECCOPY(q0, v0);
826                                 VECCOPY(q1, v1);
827                                 VecAddDir(q2, v0, v2, (sd[0]/(sd[0]-sd[2])));
828                                 VECCOPY(q3, q2);
829                         }
830                         else {
831                                 // +00
832                                 VECCOPY(q0, v0);
833                                 VECCOPY(q1, v1);
834                                 VECCOPY(q2, v2);
835                                 VECCOPY(q3, q2);
836                         }
837                 }
838         }
839         else if(sd[0] < 0) {
840                 if(sd[1] > 0) {
841                         if(sd[2] > 0) {
842                                 // -++
843                                 VecAddDir(q0, v0, v1, (sd[0]/(sd[0]-sd[1])));
844                                 VECCOPY(q1, v1);
845                                 VECCOPY(q2, v2);
846                                 VecAddDir(q3, v0, v2, (sd[0]/(sd[0]-sd[2])));
847                         }
848                         else if(sd[2] < 0) {
849                                 // -+-
850                                 VecAddDir(q0, v0, v1, (sd[0]/(sd[0]-sd[1])));
851                                 VECCOPY(q1, v1);
852                                 VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
853                                 VECCOPY(q3, q2);
854                         }
855                         else {
856                                 // -+0
857                                 VecAddDir(q0, v0, v1, (sd[0]/(sd[0]-sd[1])));
858                                 VECCOPY(q1, v1);
859                                 VECCOPY(q2, v2);
860                                 VECCOPY(q3, q2);
861                         }
862                 }
863                 else if(sd[1] < 0) {
864                         if(sd[2] > 0) {
865                                 // --+
866                                 VecAddDir(q0, v0, v2, (sd[0]/(sd[0]-sd[2])));
867                                 VecAddDir(q1, v1, v2, (sd[1]/(sd[1]-sd[2])));
868                                 VECCOPY(q2, v2);
869                                 VECCOPY(q3, q2);
870                         }
871                         else if(sd[2] < 0) {
872                                 // ---
873                                 return 0;
874                         }
875                         else {
876                                 // --0
877                                 return 0;
878                         }
879                 }
880                 else {
881                         if(sd[2] > 0) {
882                                 // -0+
883                                 VecAddDir(q0, v0, v2, (sd[0]/(sd[0]-sd[2])));
884                                 VECCOPY(q1, v1);
885                                 VECCOPY(q2, v2);
886                                 VECCOPY(q3, q2);
887                         }
888                         else if(sd[2] < 0) {
889                                 // -0-
890                                 return 0;
891                         }
892                         else {
893                                 // -00
894                                 return 0;
895                         }
896                 }
897         }
898         else {
899                 if(sd[1] > 0) {
900                         if(sd[2] > 0) {
901                                 // 0++
902                                 VECCOPY(q0, v0);
903                                 VECCOPY(q1, v1);
904                                 VECCOPY(q2, v2);
905                                 VECCOPY(q3, q2);
906                         }
907                         else if(sd[2] < 0) {
908                                 // 0+-
909                                 VECCOPY(q0, v0);
910                                 VECCOPY(q1, v1);
911                                 VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
912                                 VECCOPY(q3, q2);
913                         }
914                         else {
915                                 // 0+0
916                                 VECCOPY(q0, v0);
917                                 VECCOPY(q1, v1);
918                                 VECCOPY(q2, v2);
919                                 VECCOPY(q3, q2);
920                         }
921                 }
922                 else if(sd[1] < 0) {
923                         if(sd[2] > 0) {
924                                 // 0-+
925                                 VECCOPY(q0, v0);
926                                 VecAddDir(q1, v1, v2, (sd[1]/(sd[1]-sd[2])));
927                                 VECCOPY(q2, v2);
928                                 VECCOPY(q3, q2);
929                         }
930                         else if(sd[2] < 0) {
931                                 // 0--
932                                 return 0;
933                         }
934                         else {
935                                 // 0-0
936                                 return 0;
937                         }
938                 }
939                 else {
940                         if(sd[2] > 0) {
941                                 // 00+
942                                 VECCOPY(q0, v0);
943                                 VECCOPY(q1, v1);
944                                 VECCOPY(q2, v2);
945                                 VECCOPY(q3, q2);
946                         }
947                         else if(sd[2] < 0) {
948                                 // 00-
949                                 return 0;
950                         }
951                         else {
952                                 // 000
953                                 return 0;
954                         }
955                 }
956         }
957
958         return 1;
959 }
960
961 /* altivec optimization, this works, but is unused */
962
963 #if 0
964 #include <Accelerate/Accelerate.h>
965
966 typedef union {
967         vFloat v;
968         float f[4];
969 } vFloatResult;
970
971 static vFloat vec_splat_float(float val)
972 {
973         return (vFloat){val, val, val, val};
974 }
975
976 static float occ_quad_form_factor(float *p, float *n, float *q0, float *q1, float *q2, float *q3)
977 {
978         vFloat vcos, rlen, vrx, vry, vrz, vsrx, vsry, vsrz, gx, gy, gz, vangle;
979         vUInt8 rotate = (vUInt8){4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3};
980         vFloatResult vresult;
981         float result;
982
983         /* compute r* */
984         vrx = (vFloat){q0[0], q1[0], q2[0], q3[0]} - vec_splat_float(p[0]);
985         vry = (vFloat){q0[1], q1[1], q2[1], q3[1]} - vec_splat_float(p[1]);
986         vrz = (vFloat){q0[2], q1[2], q2[2], q3[2]} - vec_splat_float(p[2]);
987
988         /* normalize r* */
989         rlen = vec_rsqrte(vrx*vrx + vry*vry + vrz*vrz + vec_splat_float(1e-16f));
990         vrx = vrx*rlen;
991         vry = vry*rlen;
992         vrz = vrz*rlen;
993
994         /* rotate r* for cross and dot */
995         vsrx= vec_perm(vrx, vrx, rotate);
996         vsry= vec_perm(vry, vry, rotate);
997         vsrz= vec_perm(vrz, vrz, rotate);
998
999         /* cross product */
1000         gx = vsry*vrz - vsrz*vry;
1001         gy = vsrz*vrx - vsrx*vrz;
1002         gz = vsrx*vry - vsry*vrx;
1003
1004         /* normalize */
1005         rlen = vec_rsqrte(gx*gx + gy*gy + gz*gz + vec_splat_float(1e-16f));
1006         gx = gx*rlen;
1007         gy = gy*rlen;
1008         gz = gz*rlen;
1009
1010         /* angle */
1011         vcos = vrx*vsrx + vry*vsry + vrz*vsrz;
1012         vcos= vec_max(vec_min(vcos, vec_splat_float(1.0f)), vec_splat_float(-1.0f));
1013         vangle= vacosf(vcos);
1014
1015         /* dot */
1016         vresult.v = (vec_splat_float(n[0])*gx +
1017                      vec_splat_float(n[1])*gy +
1018                      vec_splat_float(n[2])*gz)*vangle;
1019
1020         result= (vresult.f[0] + vresult.f[1] + vresult.f[2] + vresult.f[3])*(0.5f/(float)M_PI);
1021         result= MAX2(result, 0.0f);
1022
1023         return result;
1024 }
1025
1026 #endif
1027
1028 /* SSE optimization, acos code doesn't work */
1029
1030 #if 0
1031
1032 #include <xmmintrin.h>
1033
1034 static __m128 sse_approx_acos(__m128 x)
1035 {
1036         /* needs a better approximation than taylor expansion of acos, since that
1037          * gives big erros for near 1.0 values, sqrt(2*x)*acos(1-x) should work
1038          * better, see http://www.tom.womack.net/projects/sse-fast-arctrig.html */
1039
1040         return _mm_set_ps1(1.0f);
1041 }
1042
1043 static float occ_quad_form_factor(float *p, float *n, float *q0, float *q1, float *q2, float *q3)
1044 {
1045         float r0[3], r1[3], r2[3], r3[3], g0[3], g1[3], g2[3], g3[3];
1046         float a1, a2, a3, a4, dot1, dot2, dot3, dot4, result;
1047         float fresult[4] __attribute__((aligned(16)));
1048         __m128 qx, qy, qz, rx, ry, rz, rlen, srx, sry, srz, gx, gy, gz, glen, rcos, angle, aresult;
1049
1050         /* compute r */
1051         qx = _mm_set_ps(q3[0], q2[0], q1[0], q0[0]);
1052         qy = _mm_set_ps(q3[1], q2[1], q1[1], q0[1]);
1053         qz = _mm_set_ps(q3[2], q2[2], q1[2], q0[2]);
1054
1055         rx = qx - _mm_set_ps1(p[0]);
1056         ry = qy - _mm_set_ps1(p[1]);
1057         rz = qz - _mm_set_ps1(p[2]);
1058
1059         /* normalize r */
1060         rlen = _mm_rsqrt_ps(rx*rx + ry*ry + rz*rz + _mm_set_ps1(1e-16f));
1061         rx = rx*rlen;
1062         ry = ry*rlen;
1063         rz = rz*rlen;
1064
1065         /* cross product */
1066         srx = _mm_shuffle_ps(rx, rx, _MM_SHUFFLE(0,3,2,1));
1067         sry = _mm_shuffle_ps(ry, ry, _MM_SHUFFLE(0,3,2,1));
1068         srz = _mm_shuffle_ps(rz, rz, _MM_SHUFFLE(0,3,2,1));
1069
1070         gx = sry*rz - srz*ry;
1071         gy = srz*rx - srx*rz;
1072         gz = srx*ry - sry*rx;
1073
1074         /* normalize g */
1075         glen = _mm_rsqrt_ps(gx*gx + gy*gy + gz*gz + _mm_set_ps1(1e-16f));
1076         gx = gx*glen;
1077         gy = gy*glen;
1078         gz = gz*glen;
1079
1080         /* compute angle */
1081         rcos = rx*srx + ry*sry + rz*srz;
1082         rcos= _mm_max_ps(_mm_min_ps(rcos, _mm_set_ps1(1.0f)), _mm_set_ps1(-1.0f));
1083
1084         angle = sse_approx_cos(rcos);
1085         aresult = (_mm_set_ps1(n[0])*gx + _mm_set_ps1(n[1])*gy + _mm_set_ps1(n[2])*gz)*angle;
1086
1087         /* sum together */
1088     result= (fresult[0] + fresult[1] + fresult[2] + fresult[3])*(0.5f/(float)M_PI);
1089         result= MAX2(result, 0.0f);
1090
1091         return result;
1092 }
1093
1094 #endif
1095
1096 static float saacosf(float fac)
1097 {
1098         if(fac<= -1.0f) return (float)M_PI;
1099         else if(fac>=1.0f) return 0.0f;
1100         else return acos(fac); /* acosf(fac) */
1101 }
1102
1103 static void normalizef(float *n)
1104 {
1105         float d;
1106         
1107         d= INPR(n, n);
1108
1109         if(d > 1.0e-35F) {
1110                 d= 1.0f/sqrt(d); /* sqrtf(d) */
1111
1112                 n[0] *= d; 
1113                 n[1] *= d; 
1114                 n[2] *= d;
1115         } 
1116 }
1117
1118 static float occ_quad_form_factor(float *p, float *n, float *q0, float *q1, float *q2, float *q3)
1119 {
1120         float r0[3], r1[3], r2[3], r3[3], g0[3], g1[3], g2[3], g3[3];
1121         float a1, a2, a3, a4, dot1, dot2, dot3, dot4, result;
1122
1123         VECSUB(r0, q0, p);
1124         VECSUB(r1, q1, p);
1125         VECSUB(r2, q2, p);
1126         VECSUB(r3, q3, p);
1127
1128         normalizef(r0);
1129         normalizef(r1);
1130         normalizef(r2);
1131         normalizef(r3);
1132
1133         Crossf(g0, r1, r0); normalizef(g0);
1134         Crossf(g1, r2, r1); normalizef(g1);
1135         Crossf(g2, r3, r2); normalizef(g2);
1136         Crossf(g3, r0, r3); normalizef(g3);
1137
1138         a1= saacosf(INPR(r0, r1));
1139         a2= saacosf(INPR(r1, r2));
1140         a3= saacosf(INPR(r2, r3));
1141         a4= saacosf(INPR(r3, r0));
1142
1143         dot1= INPR(n, g0);
1144         dot2= INPR(n, g1);
1145         dot3= INPR(n, g2);
1146         dot4= INPR(n, g3);
1147
1148         result= (a1*dot1 + a2*dot2 + a3*dot3 + a4*dot4)*0.5f/(float)M_PI;
1149         result= MAX2(result, 0.0f);
1150
1151         return result;
1152 }
1153
1154 float occ_form_factor(OccFace *face, float *p, float *n)
1155 {
1156         ObjectInstanceRen *obi;
1157         VlakRen *vlr;
1158         float v1[3], v2[3], v3[3], v4[3], q0[3], q1[3], q2[3], q3[3], contrib= 0.0f;
1159
1160         obi= &R.objectinstance[face->obi];
1161         vlr= RE_findOrAddVlak(obi->obr, face->facenr);
1162
1163         VECCOPY(v1, vlr->v1->co);
1164         VECCOPY(v2, vlr->v2->co);
1165         VECCOPY(v3, vlr->v3->co);
1166
1167         if(obi->flag & R_TRANSFORMED) {
1168                 Mat4MulVecfl(obi->mat, v1);
1169                 Mat4MulVecfl(obi->mat, v2);
1170                 Mat4MulVecfl(obi->mat, v3);
1171         }
1172
1173         if(occ_visible_quad(p, n, v1, v2, v3, q0, q1, q2, q3))
1174                 contrib += occ_quad_form_factor(p, n, q0, q1, q2, q3);
1175
1176         if(vlr->v4) {
1177                 VECCOPY(v4, vlr->v4->co);
1178                 if(obi->flag & R_TRANSFORMED)
1179                         Mat4MulVecfl(obi->mat, v4);
1180
1181                 if(occ_visible_quad(p, n, v1, v3, v4, q0, q1, q2, q3))
1182                         contrib += occ_quad_form_factor(p, n, q0, q1, q2, q3);
1183         }
1184
1185         return contrib;
1186 }
1187
1188 static void occ_lookup(OcclusionTree *tree, int thread, OccFace *exclude, float *pp, float *pn, float *occ, float *bentn)
1189 {
1190         OccNode *node, **stack;
1191         OccFace *face;
1192         float resultocc, v[3], p[3], n[3], co[3], invd2;
1193         float distfac, fac, error, d2, weight, emitarea;
1194         int b, totstack;
1195
1196         /* init variables */
1197         VECCOPY(p, pp);
1198         VECCOPY(n, pn);
1199         VECADDFAC(p, p, n, 1e-4f);
1200
1201         if(bentn)
1202                 VECCOPY(bentn, n);
1203         
1204         error= tree->error;
1205         distfac= tree->distfac;
1206
1207         resultocc= 0.0f;
1208
1209         /* init stack */
1210         stack= tree->stack[thread];
1211         stack[0]= tree->root;
1212         totstack= 1;
1213
1214         while(totstack) {
1215                 /* pop point off the stack */
1216                 node= stack[--totstack];
1217
1218                 VECSUB(v, node->co, p);
1219                 d2= INPR(v, v) + 1e-16f;
1220                 emitarea= MAX2(node->area, node->dco);
1221
1222                 if(d2*error > emitarea) {
1223                         if(distfac != 0.0f) {
1224                                 fac= 1.0f/(1.0f + distfac*d2);
1225                                 if(fac < 0.01f)
1226                                         continue;
1227                         }
1228                         else
1229                                 fac= 1.0f;
1230
1231                         /* accumulate occlusion from spherical harmonics */
1232                         invd2 = 1.0f/sqrt(d2);
1233                         weight= occ_solid_angle(node, v, d2, invd2, n);
1234                         weight *= node->occlusion;
1235
1236                         if(bentn) {
1237                                 bentn[0] -= weight*invd2*v[0];
1238                                 bentn[1] -= weight*invd2*v[1];
1239                                 bentn[2] -= weight*invd2*v[2];
1240                         }
1241
1242                         resultocc += weight*fac;
1243                 }
1244                 else {
1245                         /* traverse into children */
1246                         for(b=0; b<TOTCHILD; b++) {
1247                                 if(node->childflag & (1<<b)) {
1248                                         face= tree->face+node->child[b].face;
1249
1250                                         /* accumulate occlusion with face form factor */
1251                                         if(!exclude || !(face->obi == exclude->obi && face->facenr == exclude->facenr)) {
1252                                                 if(bentn || distfac != 0.0f) {
1253                                                         occ_face(face, co, NULL, NULL); 
1254                                                         VECSUB(v, co, p);
1255                                                         d2= INPR(v, v) + 1e-16f;
1256
1257                                                         fac= (distfac == 0.0f)? 1.0f: 1.0f/(1.0f + distfac*d2);
1258                                                         if(fac < 0.01f)
1259                                                                 continue;
1260                                                 }
1261                                                 else
1262                                                         fac= 1.0f;
1263
1264                                                 weight= occ_form_factor(face, p, n);
1265                                                 weight *= tree->occlusion[node->child[b].face];
1266
1267                                                 if(bentn) {
1268                                                         invd2= 1.0f/sqrt(d2);
1269                                                         bentn[0] -= weight*invd2*v[0];
1270                                                         bentn[1] -= weight*invd2*v[1];
1271                                                         bentn[2] -= weight*invd2*v[2];
1272                                                 }
1273
1274                                                 resultocc += weight*fac;
1275                                         }
1276                                 }
1277                                 else if(node->child[b].node) {
1278                                         /* push child on the stack */
1279                                         stack[totstack++]= node->child[b].node;
1280                                 }
1281                         }
1282                 }
1283         }
1284
1285         if(occ) *occ= resultocc;
1286 }
1287
1288 static void occ_compute_passes(Render *re, OcclusionTree *tree, int totpass)
1289 {
1290         float *occ, co[3], n[3];
1291         int pass, i;
1292         
1293         occ= MEM_callocN(sizeof(float)*tree->totface, "OcclusionPassOcc");
1294
1295         for(pass=0; pass<totpass; pass++) {
1296                 for(i=0; i<tree->totface; i++) {
1297                         occ_face(&tree->face[i], co, n, NULL);
1298                         VecMulf(n, -1.0f);
1299                         VECADDFAC(co, co, n, 1e-8f);
1300
1301                         occ_lookup(tree, 0, &tree->face[i], co, n, &occ[i], NULL);
1302                         if(re->test_break())
1303                                 break;
1304                 }
1305
1306                 if(re->test_break())
1307                         break;
1308
1309                 for(i=0; i<tree->totface; i++) {
1310                         tree->occlusion[i] -= occ[i]; //MAX2(1.0f-occ[i], 0.0f);
1311                         if(tree->occlusion[i] < 0.0f)
1312                                 tree->occlusion[i]= 0.0f;
1313                 }
1314
1315                 occ_sum_occlusion(tree, tree->root);
1316         }
1317
1318         MEM_freeN(occ);
1319 }
1320
1321 static void sample_occ_tree(Render *re, OcclusionTree *tree, OccFace *exclude, float *co, float *n, int thread, int onlyshadow, float *skycol)
1322 {
1323         float nn[3], bn[3], dxyview[3], fac, occ, occlusion, correction;
1324         int aocolor;
1325
1326         aocolor= re->wrld.aocolor;
1327         if(onlyshadow)
1328                 aocolor= WO_AOPLAIN;
1329
1330         VECCOPY(nn, n);
1331         VecMulf(nn, -1.0f);
1332
1333         occ_lookup(tree, thread, exclude, co, nn, &occ, (aocolor)? bn: NULL);
1334
1335         correction= re->wrld.ao_approx_correction;
1336
1337         occlusion= (1.0f-correction)*(1.0f-occ);
1338         CLAMP(occlusion, 0.0f, 1.0f);
1339         if(correction != 0.0f)
1340                 occlusion += correction*exp(-occ);
1341
1342         if(aocolor) {
1343                 /* sky shading using bent normal */
1344                 if(aocolor==WO_AOSKYCOL) {
1345                         fac= 0.5*(1.0f+bn[0]*re->grvec[0]+ bn[1]*re->grvec[1]+ bn[2]*re->grvec[2]);
1346                         skycol[0]= (1.0f-fac)*re->wrld.horr + fac*re->wrld.zenr;
1347                         skycol[1]= (1.0f-fac)*re->wrld.horg + fac*re->wrld.zeng;
1348                         skycol[2]= (1.0f-fac)*re->wrld.horb + fac*re->wrld.zenb;
1349                 }
1350                 else {  /* WO_AOSKYTEX */
1351                         dxyview[0]= 1.0f;
1352                         dxyview[1]= 1.0f;
1353                         dxyview[2]= 0.0f;
1354                         shadeSkyView(skycol, co, bn, dxyview);
1355                 }
1356
1357                 VecMulf(skycol, occlusion);
1358         }
1359         else {
1360                 skycol[0]= occlusion;
1361                 skycol[1]= occlusion;
1362                 skycol[2]= occlusion;
1363         }
1364 }
1365
1366 /* ---------------------------- Caching ------------------------------- */
1367
1368 static OcclusionCacheSample *find_occ_sample(OcclusionCache *cache, int x, int y)
1369 {
1370         x -= cache->x;
1371         y -= cache->y;
1372
1373         x /= cache->step;
1374         y /= cache->step;
1375         x *= cache->step;
1376         y *= cache->step;
1377
1378         if(x < 0 || x >= cache->w || y < 0 || y >= cache->h)
1379                 return NULL;
1380         else
1381                 return &cache->sample[y*cache->w + x];
1382 }
1383
1384 static int sample_occ_cache(OcclusionTree *tree, float *co, float *n, int x, int y, int thread, float *col)
1385 {
1386         OcclusionCache *cache;
1387         OcclusionCacheSample *samples[4], *sample;
1388         float wn[4], wz[4], wb[4], tx, ty, w, totw, mino, maxo;
1389         float d[3], dist2;
1390         int i, x1, y1, x2, y2;
1391
1392         if(!tree->cache)
1393                 return 0;
1394         
1395         /* first try to find a sample in the same pixel */
1396         cache= &tree->cache[thread];
1397
1398         if(cache->sample && cache->step) {
1399                 sample= &cache->sample[(y-cache->y)*cache->w + (x-cache->x)];
1400                 if(sample->filled) {
1401                         VECSUB(d, sample->co, co);
1402                         dist2= INPR(d, d);
1403                         if(dist2 < 0.5f*sample->dist2 && INPR(sample->n, n) > 0.98f) {
1404                                 VECCOPY(col, sample->col);
1405                                 return 1;
1406                         }
1407                 }
1408         }
1409         else
1410                 return 0;
1411
1412         /* try to interpolate between 4 neighbouring pixels */
1413         samples[0]= find_occ_sample(cache, x, y);
1414         samples[1]= find_occ_sample(cache, x+cache->step, y);
1415         samples[2]= find_occ_sample(cache, x, y+cache->step);
1416         samples[3]= find_occ_sample(cache, x+cache->step, y+cache->step);
1417
1418         for(i=0; i<4; i++)
1419                 if(!samples[i] || !samples[i]->filled)
1420                         return 0;
1421
1422         /* require intensities not being too different */
1423         mino= MIN4(samples[0]->intensity, samples[1]->intensity, samples[2]->intensity, samples[3]->intensity);
1424         maxo= MAX4(samples[0]->intensity, samples[1]->intensity, samples[2]->intensity, samples[3]->intensity);
1425
1426         if(maxo - mino > 0.05f)
1427                 return 0;
1428
1429         /* compute weighted interpolation between samples */
1430         col[0]= col[1]= col[2]= 0.0f;
1431         totw= 0.0f;
1432
1433         x1= samples[0]->x;
1434         y1= samples[0]->y;
1435         x2= samples[3]->x;
1436         y2= samples[3]->y;
1437
1438         tx= (float)(x2 - x)/(float)(x2 - x1);
1439         ty= (float)(y2 - y)/(float)(y2 - y1);
1440
1441         wb[3]= (1.0f-tx)*(1.0f-ty);
1442         wb[2]= (tx)*(1.0f-ty);
1443         wb[1]= (1.0f-tx)*(ty);
1444         wb[0]= tx*ty;
1445
1446         for(i=0; i<4; i++) {
1447                 VECSUB(d, samples[i]->co, co);
1448                 dist2= INPR(d, d);
1449
1450                 wz[i]= 1.0f; //(samples[i]->dist2/(1e-4f + dist2));
1451                 wn[i]= pow(INPR(samples[i]->n, n), 32.0f);
1452
1453                 w= wb[i]*wn[i]*wz[i];
1454
1455                 totw += w;
1456                 col[0] += w*samples[i]->col[0];
1457                 col[1] += w*samples[i]->col[1];
1458                 col[2] += w*samples[i]->col[2];
1459         }
1460
1461         if(totw >= 0.9f) {
1462                 totw= 1.0f/totw;
1463                 col[0] *= totw;
1464                 col[1] *= totw;
1465                 col[2] *= totw;
1466                 return 1;
1467         }
1468
1469         return 0;
1470 }
1471
1472 static void sample_occ_surface(ShadeInput *shi)
1473 {
1474         StrandRen *strand= shi->strand;
1475         StrandSurface *mesh= strand->buffer->surface;
1476         int *face, *index = RE_strandren_get_face(shi->obr, strand, 0);
1477         float w[4], *co1, *co2, *co3, *co4;
1478
1479         if(mesh && index) {
1480                 face= mesh->face[*index];
1481
1482                 co1= mesh->co[face[0]];
1483                 co2= mesh->co[face[1]];
1484                 co3= mesh->co[face[2]];
1485                 co4= (face[3])? mesh->co[face[3]]: NULL;
1486
1487                 InterpWeightsQ3Dfl(co1, co2, co3, co4, strand->vert->co, w);
1488
1489                 shi->ao[0]= shi->ao[1]= shi->ao[2]= 0.0f;
1490                 VECADDFAC(shi->ao, shi->ao, mesh->col[face[0]], w[0]);
1491                 VECADDFAC(shi->ao, shi->ao, mesh->col[face[1]], w[1]);
1492                 VECADDFAC(shi->ao, shi->ao, mesh->col[face[2]], w[2]);
1493                 if(face[3])
1494                         VECADDFAC(shi->ao, shi->ao, mesh->col[face[3]], w[3]);
1495         }
1496         else {
1497                 shi->ao[0]= 1.0f;
1498                 shi->ao[1]= 1.0f;
1499                 shi->ao[2]= 1.0f;
1500         }
1501 }
1502
1503 /* ------------------------- External Functions --------------------------- */
1504
1505 static void *exec_strandsurface_sample(void *data)
1506 {
1507         OcclusionThread *othread= (OcclusionThread*)data;
1508         Render *re= othread->re;
1509         StrandSurface *mesh= othread->mesh;
1510         float col[3], co[3], n[3], *co1, *co2, *co3, *co4;
1511         int a, *face;
1512
1513         for(a=othread->begin; a<othread->end; a++) {
1514                 face= mesh->face[a];
1515                 co1= mesh->co[face[0]];
1516                 co2= mesh->co[face[1]];
1517                 co3= mesh->co[face[2]];
1518
1519                 if(face[3]) {
1520                         co4= mesh->co[face[3]];
1521
1522                         VecLerpf(co, co1, co3, 0.5f);
1523                         CalcNormFloat4(co1, co2, co3, co4, n);
1524                 }
1525                 else {
1526                         CalcCent3f(co, co1, co2, co3);
1527                         CalcNormFloat(co1, co2, co3, n);
1528                 }
1529                 VecMulf(n, -1.0f);
1530
1531                 sample_occ_tree(re, re->occlusiontree, NULL, co, n, othread->thread, 0, col);
1532                 VECCOPY(othread->facecol[a], col);
1533         }
1534
1535         return 0;
1536 }
1537
1538 void make_occ_tree(Render *re)
1539 {
1540         OcclusionThread othreads[BLENDER_MAX_THREADS];
1541         StrandSurface *mesh;
1542         ListBase threads;
1543         float col[3], (*facecol)[3];
1544         int a, totface, totthread, *face, *count;
1545
1546         /* ugly, needed for occ_face */
1547         R= *re;
1548
1549         re->i.infostr= "Occlusion preprocessing";
1550         re->stats_draw(&re->i);
1551         
1552         re->occlusiontree= occ_tree_build(re);
1553         
1554         if(re->occlusiontree) {
1555                 if(re->wrld.ao_approx_passes)
1556                         occ_compute_passes(re, re->occlusiontree, re->wrld.ao_approx_passes);
1557
1558                 for(mesh=re->strandsurface.first; mesh; mesh=mesh->next) {
1559                         count= MEM_callocN(sizeof(int)*mesh->totvert, "OcclusionCount");
1560                         facecol= MEM_callocN(sizeof(float)*3*mesh->totface, "StrandSurfFaceCol");
1561
1562                         totthread= (mesh->totface > 10000)? re->r.threads: 1;
1563                         totface= mesh->totface/totthread;
1564                         for(a=0; a<totthread; a++) {
1565                                 othreads[a].re= re;
1566                                 othreads[a].facecol= facecol;
1567                                 othreads[a].thread= a;
1568                                 othreads[a].mesh= mesh;
1569                                 othreads[a].begin= a*totface;
1570                                 othreads[a].end= (a == totthread-1)? mesh->totface: (a+1)*totface;
1571                         }
1572
1573                         if(totthread == 1) {
1574                                 exec_strandsurface_sample(&othreads[0]);
1575                         }
1576                         else {
1577                                 BLI_init_threads(&threads, exec_strandsurface_sample, totthread);
1578
1579                                 for(a=0; a<totthread; a++)
1580                                         BLI_insert_thread(&threads, &othreads[a]);
1581
1582                                 BLI_end_threads(&threads);
1583                         }
1584
1585                         for(a=0; a<mesh->totface; a++) {
1586                                 face= mesh->face[a];
1587
1588                                 VECCOPY(col, facecol[a]);
1589                                 VECADD(mesh->col[face[0]], mesh->col[face[0]], col);
1590                                 count[face[0]]++;
1591                                 VECADD(mesh->col[face[1]], mesh->col[face[1]], col);
1592                                 count[face[1]]++;
1593                                 VECADD(mesh->col[face[2]], mesh->col[face[2]], col);
1594                                 count[face[2]]++;
1595
1596                                 if(face[3]) {
1597                                         VECADD(mesh->col[face[3]], mesh->col[face[3]], col);
1598                                         count[face[3]]++;
1599                                 }
1600                         }
1601
1602                         for(a=0; a<mesh->totvert; a++)
1603                                 if(count[a])
1604                                         VecMulf(mesh->col[a], 1.0f/count[a]);
1605
1606                         MEM_freeN(count);
1607                         MEM_freeN(facecol);
1608                 }
1609         }
1610 }
1611
1612 void free_occ(Render *re)
1613 {
1614         if(re->occlusiontree) {
1615                 occ_free_tree(re->occlusiontree);
1616                 re->occlusiontree = NULL;
1617         }
1618 }
1619
1620 void sample_occ(Render *re, ShadeInput *shi)
1621 {
1622         OcclusionTree *tree= re->occlusiontree;
1623         OcclusionCache *cache;
1624         OcclusionCacheSample *sample;
1625         OccFace exclude;
1626         int onlyshadow;
1627
1628         if(tree) {
1629                 if(shi->strand) {
1630                         sample_occ_surface(shi);
1631                 }
1632                 /* try to get result from the cache if possible */
1633                 else if(!sample_occ_cache(tree, shi->co, shi->vno, shi->xs, shi->ys, shi->thread, shi->ao)) {
1634                         /* no luck, let's sample the occlusion */
1635                         exclude.obi= shi->obi - re->objectinstance;
1636                         exclude.facenr= shi->vlr->index;
1637                         onlyshadow= (shi->mat->mode & MA_ONLYSHADOW);
1638                         sample_occ_tree(re, tree, &exclude, shi->co, shi->vno, shi->thread, onlyshadow, shi->ao);
1639
1640                         if(G.rt & 32)
1641                                 shi->ao[2] *= 2.0f;
1642
1643                         /* fill result into sample, each time */
1644                         if(tree->cache) {
1645                                 cache= &tree->cache[shi->thread];
1646
1647                                 if(cache->sample && cache->step) {
1648                                         sample= &cache->sample[(shi->ys-cache->y)*cache->w + (shi->xs-cache->x)];
1649                                         VECCOPY(sample->co, shi->co);
1650                                         VECCOPY(sample->n, shi->vno);
1651                                         VECCOPY(sample->col, shi->ao);
1652                                         sample->intensity= MAX3(sample->col[0], sample->col[1], sample->col[2]);
1653                                         sample->dist2= INPR(shi->dxco, shi->dxco) + INPR(shi->dyco, shi->dyco);
1654                                         sample->filled= 1;
1655                                 }
1656                         }
1657                 }
1658         }
1659         else {
1660                 shi->ao[0]= 1.0f;
1661                 shi->ao[1]= 1.0f;
1662                 shi->ao[2]= 1.0f;
1663         }
1664 }
1665
1666 void cache_occ_samples(Render *re, RenderPart *pa, ShadeSample *ssamp)
1667 {
1668         OcclusionTree *tree= re->occlusiontree;
1669         PixStr ps;
1670         OcclusionCache *cache;
1671         OcclusionCacheSample *sample;
1672         OccFace exclude;
1673         ShadeInput *shi;
1674         int *ro, *rp, *rz, onlyshadow;
1675         int x, y, step = CACHE_STEP;
1676
1677         if(!tree->cache)
1678                 return;
1679
1680         cache= &tree->cache[pa->thread];
1681         cache->w= pa->rectx;
1682         cache->h= pa->recty;
1683         cache->x= pa->disprect.xmin;
1684         cache->y= pa->disprect.ymin;
1685         cache->step= step;
1686         cache->sample= MEM_callocN(sizeof(OcclusionCacheSample)*cache->w*cache->h, "OcclusionCacheSample");
1687         sample= cache->sample;
1688
1689         ps.next= NULL;
1690         ps.mask= (1<<re->osa);
1691
1692         ro= pa->recto;
1693         rp= pa->rectp;
1694         rz= pa->rectz;
1695
1696         /* compute a sample at every step pixels */
1697         for(y=pa->disprect.ymin; y<pa->disprect.ymax; y++) {
1698                 for(x=pa->disprect.xmin; x<pa->disprect.xmax; x++, ro++, rp++, rz++, sample++) {
1699                         if((((x - pa->disprect.xmin + step) % step) == 0 || x == pa->disprect.xmax-1) && (((y - pa->disprect.ymin + step) % step) == 0 || y == pa->disprect.ymax-1)) {
1700                                 if(*rp) {
1701                                         ps.obi= *ro;
1702                                         ps.facenr= *rp;
1703                                         ps.z= *rz;
1704
1705                                         shade_samples_fill_with_ps(ssamp, &ps, x, y);
1706                                         shi= ssamp->shi;
1707                                         if(!shi->vlr)
1708                                                 continue;
1709
1710                                         onlyshadow= (shi->mat->mode & MA_ONLYSHADOW);
1711                                         exclude.obi= shi->obi - re->objectinstance;
1712                                         exclude.facenr= shi->vlr->index;
1713                                         sample_occ_tree(re, tree, &exclude, shi->co, shi->vno, shi->thread, onlyshadow, shi->ao);
1714
1715                                         VECCOPY(sample->co, shi->co);
1716                                         VECCOPY(sample->n, shi->vno);
1717                                         VECCOPY(sample->col, shi->ao);
1718                                         sample->intensity= MAX3(sample->col[0], sample->col[1], sample->col[2]);
1719                                         sample->dist2= INPR(shi->dxco, shi->dxco) + INPR(shi->dyco, shi->dyco);
1720                                         sample->x= shi->xs;
1721                                         sample->y= shi->ys;
1722                                         sample->filled= 1;
1723                                 }
1724
1725                                 if(re->test_break())
1726                                         break;
1727                         }
1728                 }
1729         }
1730 }
1731
1732 void free_occ_samples(Render *re, RenderPart *pa)
1733 {
1734         OcclusionTree *tree= re->occlusiontree;
1735         OcclusionCache *cache;
1736
1737         if(tree->cache) {
1738                 cache= &tree->cache[pa->thread];
1739
1740                 if(cache->sample)
1741                         MEM_freeN(cache->sample);
1742
1743                 cache->w= 0;
1744                 cache->h= 0;
1745                 cache->step= 0;
1746         }
1747 }
1748