4 * ***** BEGIN GPL LICENSE BLOCK *****
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software Foundation,
18 * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
20 * The Original Code is Copyright (C) 2008 Blender Foundation.
21 * All rights reserved.
23 * The Original Code is: all of this file.
25 * Contributor(s): Brecht Van Lommel.
27 * ***** END GPL LICENSE BLOCK *****
35 #include "MEM_guardedalloc.h"
37 #include "DNA_material_types.h"
39 #include "BLI_arithb.h"
40 #include "BLI_blenlib.h"
41 #include "BLI_memarena.h"
42 #include "BLI_threads.h"
44 #include "BKE_global.h"
45 #include "BKE_scene.h"
46 #include "BKE_utildefines.h"
48 #include "RE_shader_ext.h"
51 #include "occlusion.h"
52 #include "render_types.h"
53 #include "rendercore.h"
54 #include "renderdatabase.h"
55 #include "pixelshading.h"
59 /* ------------------------- Declarations --------------------------- */
61 #define INVALID_INDEX ((int)(~0))
62 #define INVPI 0.31830988618379069f
66 typedef struct OcclusionCacheSample {
67 float co[3], n[3], col[3], intensity, dist2;
69 } OcclusionCacheSample;
71 typedef struct OcclusionCache {
72 OcclusionCacheSample *sample;
76 typedef struct OccFace {
81 typedef struct OccNode {
93 typedef struct OcclusionTree {
96 float (*co)[3]; /* temporary during build */
98 OccFace *face; /* instance and face indices */
99 float *occlusion; /* occlusion for faces */
103 OccNode **stack[BLENDER_MAX_THREADS];
114 OcclusionCache *cache;
117 typedef struct OcclusionThread {
125 typedef struct OcclusionBuildThread {
127 int begin, end, depth;
129 } OcclusionBuildThread;
131 /* ------------------------- Shading --------------------------- */
133 extern Render R; // meh
136 static void occ_shade(ShadeSample *ssamp, ObjectInstanceRen *obi, VlakRen *vlr, float *rad)
138 ShadeInput *shi= ssamp->shi;
139 ShadeResult *shr= ssamp->shr;
140 float l, u, v, *v1, *v2, *v3;
148 shi->u= u= 1.0f/3.0f;
149 shi->v= v= 1.0f/3.0f;
152 /* setup render coordinates */
160 shi->co[0]= l*v3[0]+u*v1[0]+v*v2[0];
161 shi->co[1]= l*v3[1]+u*v1[1]+v*v2[1];
162 shi->co[2]= l*v3[2]+u*v1[2]+v*v2[2];
164 shade_input_set_triangle_i(shi, obi, vlr, 0, 1, 2);
166 /* set up view vector */
167 VECCOPY(shi->view, shi->co);
168 Normalize(shi->view);
170 /* cache for shadow */
176 shade_input_set_normals(shi);
180 shade_input_flip_normals(shi);
182 /* not a pretty solution, but fixes common cases */
183 if(shi->obr->ob && shi->obr->ob->transflag & OB_NEG_SCALE) {
188 /* init material vars */
189 // note, keep this synced with render_types.h
190 memcpy(&shi->r, &shi->mat->r, 23*sizeof(float));
191 shi->har= shi->mat->har;
194 shade_input_set_shade_texco(shi);
195 shade_material_loop(shi, shr); /* todo: nodes */
197 VECCOPY(rad, shr->combined);
200 static void occ_build_shade(Render *re, OcclusionTree *tree)
203 ObjectInstanceRen *obi;
209 /* setup shade sample with correct passes */
210 memset(&ssamp, 0, sizeof(ShadeSample));
211 ssamp.shi[0].lay= re->scene->lay;
212 ssamp.shi[0].passflag= SCE_PASS_DIFFUSE|SCE_PASS_RGBA;
213 ssamp.shi[0].combinedflag= ~(SCE_PASS_SPEC);
216 for(a=0; a<tree->totface; a++) {
217 obi= &R.objectinstance[tree->face[a].obi];
218 vlr= RE_findOrAddVlak(obi->obr, tree->face[a].vlr);
220 occ_shade(&ssamp, obi, vlr, tree->rad[a]);
225 /* ------------------------- Spherical Harmonics --------------------------- */
227 /* Use 2nd order SH => 9 coefficients, stored in this order:
229 1 = (1,-1), 2 = (1,0), 3 = (1,1),
230 4 = (2,-2), 5 = (2,-1), 6 = (2,0), 7 = (2,1), 8 = (2,2) */
232 static void sh_copy(float *shresult, float *sh)
234 memcpy(shresult, sh, sizeof(float)*9);
237 static void sh_mul(float *sh, float f)
245 static void sh_add(float *shresult, float *sh1, float *sh2)
250 shresult[i]= sh1[i] + sh2[i];
253 static void sh_from_disc(float *n, float area, float *shresult)
255 /* See formula (3) in:
256 "An Efficient Representation for Irradiance Environment Maps" */
257 float sh[9], x, y, z;
269 sh[4]= 1.092548f*x*y;
270 sh[5]= 1.092548f*y*z;
271 sh[6]= 0.315392f*(3.0f*z*z - 1.0f);
272 sh[7]= 1.092548f*x*z;
273 sh[8]= 0.546274f*(x*x - y*y);
276 sh_copy(shresult, sh);
279 static float sh_eval(float *sh, float *v)
281 /* See formula (13) in:
282 "An Efficient Representation for Irradiance Environment Maps" */
283 static const float c1 = 0.429043f, c2 = 0.511664f, c3 = 0.743125f;
284 static const float c4 = 0.886227f, c5 = 0.247708f;
291 sum= c1*sh[8]*(x*x - y*y);
295 sum += 2.0f*c1*(sh[4]*x*y + sh[7]*x*z + sh[5]*y*z);
296 sum += 2.0f*c2*(sh[3]*x + sh[1]*y + sh[2]*z);
301 /* ------------------------------ Building --------------------------------- */
303 static void occ_face(const OccFace *face, float *co, float *normal, float *area)
305 ObjectInstanceRen *obi;
307 float v1[3], v2[3], v3[3], v4[3];
309 obi= &R.objectinstance[face->obi];
310 vlr= RE_findOrAddVlak(obi->obr, face->facenr);
314 VecLerpf(co, vlr->v1->co, vlr->v3->co, 0.5f);
316 CalcCent3f(co, vlr->v1->co, vlr->v2->co, vlr->v3->co);
318 if(obi->flag & R_TRANSFORMED)
319 Mat4MulVecfl(obi->mat, co);
323 normal[0]= -vlr->n[0];
324 normal[1]= -vlr->n[1];
325 normal[2]= -vlr->n[2];
327 if(obi->flag & R_TRANSFORMED)
328 Mat3MulVecfl(obi->nmat, normal);
332 VECCOPY(v1, vlr->v1->co);
333 VECCOPY(v2, vlr->v2->co);
334 VECCOPY(v3, vlr->v3->co);
335 if(vlr->v4) VECCOPY(v4, vlr->v4->co);
337 if(obi->flag & R_TRANSFORMED) {
338 Mat4MulVecfl(obi->mat, v1);
339 Mat4MulVecfl(obi->mat, v2);
340 Mat4MulVecfl(obi->mat, v3);
341 if(vlr->v4) Mat4MulVecfl(obi->mat, v4);
344 /* todo: correct area for instances */
346 *area= AreaQ3Dfl(v1, v2, v3, v4);
348 *area= AreaT3Dfl(v1, v2, v3);
352 static void occ_sum_occlusion(OcclusionTree *tree, OccNode *node)
355 float occ, area, totarea;
361 for(b=0; b<TOTCHILD; b++) {
362 if(node->childflag & (1<<b)) {
363 a= node->child[b].face;
364 occ_face(&tree->face[a], 0, 0, &area);
365 occ += area*tree->occlusion[a];
368 else if(node->child[b].node) {
369 child= node->child[b].node;
370 occ_sum_occlusion(tree, child);
372 occ += child->area*child->occlusion;
373 totarea += child->area;
380 node->occlusion= occ;
383 static int occ_find_bbox_axis(OcclusionTree *tree, int begin, int end, float *min, float *max)
385 float len, maxlen= -1.0f;
388 INIT_MINMAX(min, max);
390 for(a=begin; a<end; a++)
391 DO_MINMAX(tree->co[a], min, max)
394 len= max[a] - min[a];
405 static void occ_node_from_face(OccFace *face, OccNode *node)
409 occ_face(face, node->co, n, &node->area);
411 sh_from_disc(n, node->area, node->sh);
414 static void occ_build_dco(OcclusionTree *tree, OccNode *node, float *co, float *dco)
417 float dist, d[3], nco[3];
420 for(b=0; b<TOTCHILD; b++) {
421 if(node->childflag & (1<<b)) {
422 occ_face(tree->face+node->child[b].face, nco, 0, 0);
424 else if(node->child[b].node) {
425 child= node->child[b].node;
426 occ_build_dco(tree, child, co, dco);
427 VECCOPY(nco, child->co);
437 static void occ_build_split(OcclusionTree *tree, int begin, int end, int *split)
439 float min[3], max[3], mid;
442 /* split in middle of boundbox. this seems faster than median split
443 * on complex scenes, possibly since it avoids two distant faces to
444 * be in the same node better? */
445 axis= occ_find_bbox_axis(tree, begin, end, min, max);
446 mid= 0.5f*(min[axis]+max[axis]);
451 if(tree->co[a][axis] > mid) {
453 SWAP(OccFace, tree->face[a], tree->face[enda]);
454 SWAP(float, tree->co[a][0], tree->co[enda][0]);
455 SWAP(float, tree->co[a][1], tree->co[enda][1]);
456 SWAP(float, tree->co[a][2], tree->co[enda][2]);
465 static void occ_build_8_split(OcclusionTree *tree, int begin, int end, int *offset, int *count)
467 /* split faces into eight groups */
468 int b, splitx, splity[2], splitz[4];
470 occ_build_split(tree, begin, end, &splitx);
472 /* force split if none found, to deal with degenerate geometry */
473 if(splitx == begin || splitx == end)
474 splitx= (begin+end)/2;
476 occ_build_split(tree, begin, splitx, &splity[0]);
477 occ_build_split(tree, splitx, end, &splity[1]);
479 occ_build_split(tree, begin, splity[0], &splitz[0]);
480 occ_build_split(tree, splity[0], splitx, &splitz[1]);
481 occ_build_split(tree, splitx, splity[1], &splitz[2]);
482 occ_build_split(tree, splity[1], end, &splitz[3]);
485 offset[1]= splitz[0];
486 offset[2]= splity[0];
487 offset[3]= splitz[1];
489 offset[5]= splitz[2];
490 offset[6]= splity[1];
491 offset[7]= splitz[3];
494 count[b]= offset[b+1] - offset[b];
495 count[7]= end - offset[7];
498 static void occ_build_recursive(OcclusionTree *tree, OccNode *node, int begin, int end, int depth);
500 static void *exec_occ_build(void *data)
502 OcclusionBuildThread *othread= (OcclusionBuildThread*)data;
504 occ_build_recursive(othread->tree, othread->node, othread->begin, othread->end, othread->depth);
509 static void occ_build_recursive(OcclusionTree *tree, OccNode *node, int begin, int end, int depth)
512 OcclusionBuildThread othreads[BLENDER_MAX_THREADS];
513 OccNode *child, tmpnode;
515 int a, b, totthread=0, offset[TOTCHILD], count[TOTCHILD];
518 node->occlusion= 1.0f;
520 /* leaf node with only children */
521 if(end - begin <= TOTCHILD) {
522 for(a=begin, b=0; a<end; a++, b++) {
523 face= &tree->face[a];
524 node->child[b].face= a;
525 node->childflag |= (1<<b);
530 occ_build_8_split(tree, begin, end, offset, count);
532 if(depth == 1 && tree->dothreadedbuild)
533 BLI_init_threads(&threads, exec_occ_build, tree->totbuildthread);
535 for(b=0; b<TOTCHILD; b++) {
537 node->child[b].node= NULL;
539 else if(count[b] == 1) {
540 face= &tree->face[offset[b]];
541 node->child[b].face= offset[b];
542 node->childflag |= (1<<b);
545 if(tree->dothreadedbuild)
546 BLI_lock_thread(LOCK_CUSTOM1);
548 child= BLI_memarena_alloc(tree->arena, sizeof(OccNode));
549 node->child[b].node= child;
551 /* keep track of maximum depth for stack */
552 if(depth+1 > tree->maxdepth)
553 tree->maxdepth= depth+1;
555 if(tree->dothreadedbuild)
556 BLI_unlock_thread(LOCK_CUSTOM1);
558 if(depth == 1 && tree->dothreadedbuild) {
559 othreads[totthread].tree= tree;
560 othreads[totthread].node= child;
561 othreads[totthread].begin= offset[b];
562 othreads[totthread].end= offset[b]+count[b];
563 othreads[totthread].depth= depth+1;
564 BLI_insert_thread(&threads, &othreads[totthread]);
568 occ_build_recursive(tree, child, offset[b], offset[b]+count[b], depth+1);
572 if(depth == 1 && tree->dothreadedbuild)
573 BLI_end_threads(&threads);
576 /* combine area, position and sh */
577 for(b=0; b<TOTCHILD; b++) {
578 if(node->childflag & (1<<b)) {
580 occ_node_from_face(tree->face+node->child[b].face, &tmpnode);
583 child= node->child[b].node;
587 node->area += child->area;
588 sh_add(node->sh, node->sh, child->sh);
589 VECADDFAC(node->co, node->co, child->co, child->area);
593 if(node->area != 0.0f)
594 VecMulf(node->co, 1.0f/node->area);
596 /* compute maximum distance from center */
598 occ_build_dco(tree, node, node->co, &node->dco);
601 static void occ_build_sh_normalize(OccNode *node)
603 /* normalize spherical harmonics to not include area, so
604 * we can clamp the dot product and then mutliply by area */
607 if(node->area != 0.0f)
608 sh_mul(node->sh, 1.0f/node->area);
610 for(b=0; b<TOTCHILD; b++) {
611 if(node->childflag & (1<<b));
612 else if(node->child[b].node)
613 occ_build_sh_normalize(node->child[b].node);
617 static OcclusionTree *occ_tree_build(Render *re)
620 ObjectInstanceRen *obi;
623 int a, b, c, totface;
627 for(obi=re->instancetable.first; obi; obi=obi->next) {
629 for(a=0; a<obr->totvlak; a++) {
630 if((a & 255)==0) vlr= obr->vlaknodes[a>>8].vlak;
633 if((vlr->mat->mode & MA_TRACEBLE) && (vlr->mat->material_type == MA_TYPE_SURFACE))
641 tree= MEM_callocN(sizeof(OcclusionTree), "OcclusionTree");
642 tree->totface= totface;
645 tree->error= get_render_aosss_error(&re->r, re->wrld.ao_approx_error);
646 tree->distfac= (re->wrld.aomode & WO_AODIST)? re->wrld.aodistfac: 0.0f;
649 tree->arena= BLI_memarena_new(0x8000 * sizeof(OccNode));
650 BLI_memarena_use_calloc(tree->arena);
652 if(re->wrld.aomode & WO_AOCACHE)
653 tree->cache= MEM_callocN(sizeof(OcclusionCache)*BLENDER_MAX_THREADS, "OcclusionCache");
655 tree->face= MEM_callocN(sizeof(OccFace)*totface, "OcclusionFace");
656 tree->co= MEM_callocN(sizeof(float)*3*totface, "OcclusionCo");
657 tree->occlusion= MEM_callocN(sizeof(float)*totface, "OcclusionOcclusion");
659 /* make array of face pointers */
660 for(b=0, c=0, obi=re->instancetable.first; obi; obi=obi->next, c++) {
662 for(a=0; a<obr->totvlak; a++) {
663 if((a & 255)==0) vlr= obr->vlaknodes[a>>8].vlak;
666 if((vlr->mat->mode & MA_TRACEBLE) && (vlr->mat->material_type == MA_TYPE_SURFACE)) {
667 tree->face[b].obi= c;
668 tree->face[b].facenr= a;
669 tree->occlusion[b]= 1.0f;
670 occ_face(&tree->face[b], tree->co[b], NULL, NULL);
677 tree->totbuildthread= (re->r.threads > 1 && totface > 10000)? 8: 1;
678 tree->dothreadedbuild= (tree->totbuildthread > 1);
681 tree->root= BLI_memarena_alloc(tree->arena, sizeof(OccNode));
683 occ_build_recursive(tree, tree->root, 0, totface, 1);
686 if(tree->doindirect) {
687 occ_build_shade(re, tree);
688 occ_sum_occlusion(tree, tree->root);
695 occ_build_sh_normalize(tree->root);
697 for(a=0; a<BLENDER_MAX_THREADS; a++)
698 tree->stack[a]= MEM_callocN(sizeof(OccNode)*TOTCHILD*(tree->maxdepth+1), "OccStack");
703 static void occ_free_tree(OcclusionTree *tree)
708 if(tree->arena) BLI_memarena_free(tree->arena);
709 for(a=0; a<BLENDER_MAX_THREADS; a++)
711 MEM_freeN(tree->stack[a]);
712 if(tree->occlusion) MEM_freeN(tree->occlusion);
713 if(tree->face) MEM_freeN(tree->face);
714 if(tree->cache) MEM_freeN(tree->cache);
719 /* ------------------------- Traversal --------------------------- */
721 static float occ_solid_angle(OccNode *node, float *v, float d2, float invd2, float *receivenormal)
723 float dotreceive, dotemit;
729 dotemit= sh_eval(node->sh, ev);
730 dotreceive= INPR(receivenormal, v)*invd2;
732 CLAMP(dotemit, 0.0f, 1.0f);
733 CLAMP(dotreceive, 0.0f, 1.0f);
735 return ((node->area*dotemit*dotreceive)/(d2 + node->area*INVPI))*INVPI;
738 static void VecAddDir(float *result, float *v1, float *v2, float fac)
740 result[0]= v1[0] + fac*(v2[0] - v1[0]);
741 result[1]= v1[1] + fac*(v2[1] - v1[1]);
742 result[2]= v1[2] + fac*(v2[2] - v1[2]);
745 static int occ_visible_quad(float *p, float *n, float *v0, float *v1, float *v2, float *q0, float *q1, float *q2, float *q3)
747 static const float epsilon = 1e-6f;
752 /* signed distances from the vertices to the plane. */
753 sd[0]= INPR(n, v0) - c;
754 sd[1]= INPR(n, v1) - c;
755 sd[2]= INPR(n, v2) - c;
757 if(fabsf(sd[0]) < epsilon) sd[0] = 0.0f;
758 if(fabsf(sd[1]) < epsilon) sd[1] = 0.0f;
759 if(fabsf(sd[2]) < epsilon) sd[2] = 0.0f;
774 VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
775 VecAddDir(q3, v0, v2, (sd[0]/(sd[0]-sd[2])));
789 VecAddDir(q1, v0, v1, (sd[0]/(sd[0]-sd[1])));
790 VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
796 VecAddDir(q1, v0, v1, (sd[0]/(sd[0]-sd[1])));
797 VecAddDir(q2, v0, v2, (sd[0]/(sd[0]-sd[2])));
803 VecAddDir(q1, v0, v1, (sd[0]/(sd[0]-sd[1])));
820 VecAddDir(q2, v0, v2, (sd[0]/(sd[0]-sd[2])));
836 VecAddDir(q0, v0, v1, (sd[0]/(sd[0]-sd[1])));
839 VecAddDir(q3, v0, v2, (sd[0]/(sd[0]-sd[2])));
843 VecAddDir(q0, v0, v1, (sd[0]/(sd[0]-sd[1])));
845 VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
850 VecAddDir(q0, v0, v1, (sd[0]/(sd[0]-sd[1])));
859 VecAddDir(q0, v0, v2, (sd[0]/(sd[0]-sd[2])));
860 VecAddDir(q1, v1, v2, (sd[1]/(sd[1]-sd[2])));
876 VecAddDir(q0, v0, v2, (sd[0]/(sd[0]-sd[2])));
904 VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
919 VecAddDir(q1, v1, v2, (sd[1]/(sd[1]-sd[2])));
954 /* altivec optimization, this works, but is unused */
957 #include <Accelerate/Accelerate.h>
964 static vFloat vec_splat_float(float val)
966 return (vFloat){val, val, val, val};
969 static float occ_quad_form_factor(float *p, float *n, float *q0, float *q1, float *q2, float *q3)
971 vFloat vcos, rlen, vrx, vry, vrz, vsrx, vsry, vsrz, gx, gy, gz, vangle;
972 vUInt8 rotate = (vUInt8){4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3};
973 vFloatResult vresult;
977 vrx = (vFloat){q0[0], q1[0], q2[0], q3[0]} - vec_splat_float(p[0]);
978 vry = (vFloat){q0[1], q1[1], q2[1], q3[1]} - vec_splat_float(p[1]);
979 vrz = (vFloat){q0[2], q1[2], q2[2], q3[2]} - vec_splat_float(p[2]);
982 rlen = vec_rsqrte(vrx*vrx + vry*vry + vrz*vrz + vec_splat_float(1e-16f));
987 /* rotate r* for cross and dot */
988 vsrx= vec_perm(vrx, vrx, rotate);
989 vsry= vec_perm(vry, vry, rotate);
990 vsrz= vec_perm(vrz, vrz, rotate);
993 gx = vsry*vrz - vsrz*vry;
994 gy = vsrz*vrx - vsrx*vrz;
995 gz = vsrx*vry - vsry*vrx;
998 rlen = vec_rsqrte(gx*gx + gy*gy + gz*gz + vec_splat_float(1e-16f));
1004 vcos = vrx*vsrx + vry*vsry + vrz*vsrz;
1005 vcos= vec_max(vec_min(vcos, vec_splat_float(1.0f)), vec_splat_float(-1.0f));
1006 vangle= vacosf(vcos);
1009 vresult.v = (vec_splat_float(n[0])*gx +
1010 vec_splat_float(n[1])*gy +
1011 vec_splat_float(n[2])*gz)*vangle;
1013 result= (vresult.f[0] + vresult.f[1] + vresult.f[2] + vresult.f[3])*(0.5f/(float)M_PI);
1014 result= MAX2(result, 0.0f);
1021 /* SSE optimization, acos code doesn't work */
1025 #include <xmmintrin.h>
1027 static __m128 sse_approx_acos(__m128 x)
1029 /* needs a better approximation than taylor expansion of acos, since that
1030 * gives big erros for near 1.0 values, sqrt(2*x)*acos(1-x) should work
1031 * better, see http://www.tom.womack.net/projects/sse-fast-arctrig.html */
1033 return _mm_set_ps1(1.0f);
1036 static float occ_quad_form_factor(float *p, float *n, float *q0, float *q1, float *q2, float *q3)
1038 float r0[3], r1[3], r2[3], r3[3], g0[3], g1[3], g2[3], g3[3];
1039 float a1, a2, a3, a4, dot1, dot2, dot3, dot4, result;
1040 float fresult[4] __attribute__((aligned(16)));
1041 __m128 qx, qy, qz, rx, ry, rz, rlen, srx, sry, srz, gx, gy, gz, glen, rcos, angle, aresult;
1044 qx = _mm_set_ps(q3[0], q2[0], q1[0], q0[0]);
1045 qy = _mm_set_ps(q3[1], q2[1], q1[1], q0[1]);
1046 qz = _mm_set_ps(q3[2], q2[2], q1[2], q0[2]);
1048 rx = qx - _mm_set_ps1(p[0]);
1049 ry = qy - _mm_set_ps1(p[1]);
1050 rz = qz - _mm_set_ps1(p[2]);
1053 rlen = _mm_rsqrt_ps(rx*rx + ry*ry + rz*rz + _mm_set_ps1(1e-16f));
1059 srx = _mm_shuffle_ps(rx, rx, _MM_SHUFFLE(0,3,2,1));
1060 sry = _mm_shuffle_ps(ry, ry, _MM_SHUFFLE(0,3,2,1));
1061 srz = _mm_shuffle_ps(rz, rz, _MM_SHUFFLE(0,3,2,1));
1063 gx = sry*rz - srz*ry;
1064 gy = srz*rx - srx*rz;
1065 gz = srx*ry - sry*rx;
1068 glen = _mm_rsqrt_ps(gx*gx + gy*gy + gz*gz + _mm_set_ps1(1e-16f));
1074 rcos = rx*srx + ry*sry + rz*srz;
1075 rcos= _mm_max_ps(_mm_min_ps(rcos, _mm_set_ps1(1.0f)), _mm_set_ps1(-1.0f));
1077 angle = sse_approx_cos(rcos);
1078 aresult = (_mm_set_ps1(n[0])*gx + _mm_set_ps1(n[1])*gy + _mm_set_ps1(n[2])*gz)*angle;
1081 result= (fresult[0] + fresult[1] + fresult[2] + fresult[3])*(0.5f/(float)M_PI);
1082 result= MAX2(result, 0.0f);
1089 static void normalizef(float *n)
1104 static float occ_quad_form_factor(float *p, float *n, float *q0, float *q1, float *q2, float *q3)
1106 float r0[3], r1[3], r2[3], r3[3], g0[3], g1[3], g2[3], g3[3];
1107 float a1, a2, a3, a4, dot1, dot2, dot3, dot4, result;
1119 Crossf(g0, r1, r0); normalizef(g0);
1120 Crossf(g1, r2, r1); normalizef(g1);
1121 Crossf(g2, r3, r2); normalizef(g2);
1122 Crossf(g3, r0, r3); normalizef(g3);
1124 a1= saacosf(INPR(r0, r1));
1125 a2= saacosf(INPR(r1, r2));
1126 a3= saacosf(INPR(r2, r3));
1127 a4= saacosf(INPR(r3, r0));
1134 result= (a1*dot1 + a2*dot2 + a3*dot3 + a4*dot4)*0.5f/(float)M_PI;
1135 result= MAX2(result, 0.0f);
1140 static float occ_form_factor(OccFace *face, float *p, float *n)
1142 ObjectInstanceRen *obi;
1144 float v1[3], v2[3], v3[3], v4[3], q0[3], q1[3], q2[3], q3[3], contrib= 0.0f;
1146 obi= &R.objectinstance[face->obi];
1147 vlr= RE_findOrAddVlak(obi->obr, face->facenr);
1149 VECCOPY(v1, vlr->v1->co);
1150 VECCOPY(v2, vlr->v2->co);
1151 VECCOPY(v3, vlr->v3->co);
1153 if(obi->flag & R_TRANSFORMED) {
1154 Mat4MulVecfl(obi->mat, v1);
1155 Mat4MulVecfl(obi->mat, v2);
1156 Mat4MulVecfl(obi->mat, v3);
1159 if(occ_visible_quad(p, n, v1, v2, v3, q0, q1, q2, q3))
1160 contrib += occ_quad_form_factor(p, n, q0, q1, q2, q3);
1163 VECCOPY(v4, vlr->v4->co);
1164 if(obi->flag & R_TRANSFORMED)
1165 Mat4MulVecfl(obi->mat, v4);
1167 if(occ_visible_quad(p, n, v1, v3, v4, q0, q1, q2, q3))
1168 contrib += occ_quad_form_factor(p, n, q0, q1, q2, q3);
1174 static void occ_lookup(OcclusionTree *tree, int thread, OccFace *exclude, float *pp, float *pn, float *occ, float *bentn)
1176 OccNode *node, **stack;
1178 float resultocc, v[3], p[3], n[3], co[3], invd2;
1179 float distfac, fac, error, d2, weight, emitarea;
1182 /* init variables */
1185 VECADDFAC(p, p, n, 1e-4f);
1191 distfac= tree->distfac;
1196 stack= tree->stack[thread];
1197 stack[0]= tree->root;
1201 /* pop point off the stack */
1202 node= stack[--totstack];
1204 VECSUB(v, node->co, p);
1205 d2= INPR(v, v) + 1e-16f;
1206 emitarea= MAX2(node->area, node->dco);
1208 if(d2*error > emitarea) {
1209 if(distfac != 0.0f) {
1210 fac= 1.0f/(1.0f + distfac*d2);
1217 /* accumulate occlusion from spherical harmonics */
1218 invd2 = 1.0f/sqrtf(d2);
1219 weight= occ_solid_angle(node, v, d2, invd2, n);
1220 weight *= node->occlusion;
1223 bentn[0] -= weight*invd2*v[0];
1224 bentn[1] -= weight*invd2*v[1];
1225 bentn[2] -= weight*invd2*v[2];
1228 resultocc += weight*fac;
1231 /* traverse into children */
1232 for(b=0; b<TOTCHILD; b++) {
1233 if(node->childflag & (1<<b)) {
1234 face= tree->face+node->child[b].face;
1236 /* accumulate occlusion with face form factor */
1237 if(!exclude || !(face->obi == exclude->obi && face->facenr == exclude->facenr)) {
1238 if(bentn || distfac != 0.0f) {
1239 occ_face(face, co, NULL, NULL);
1241 d2= INPR(v, v) + 1e-16f;
1243 fac= (distfac == 0.0f)? 1.0f: 1.0f/(1.0f + distfac*d2);
1250 weight= occ_form_factor(face, p, n);
1251 weight *= tree->occlusion[node->child[b].face];
1254 invd2= 1.0f/sqrtf(d2);
1255 bentn[0] -= weight*invd2*v[0];
1256 bentn[1] -= weight*invd2*v[1];
1257 bentn[2] -= weight*invd2*v[2];
1260 resultocc += weight*fac;
1263 else if(node->child[b].node) {
1264 /* push child on the stack */
1265 stack[totstack++]= node->child[b].node;
1271 if(occ) *occ= resultocc;
1272 if(bentn) Normalize(bentn);
1275 static void occ_compute_passes(Render *re, OcclusionTree *tree, int totpass)
1277 float *occ, co[3], n[3];
1280 occ= MEM_callocN(sizeof(float)*tree->totface, "OcclusionPassOcc");
1282 for(pass=0; pass<totpass; pass++) {
1283 for(i=0; i<tree->totface; i++) {
1284 occ_face(&tree->face[i], co, n, NULL);
1286 VECADDFAC(co, co, n, 1e-8f);
1288 occ_lookup(tree, 0, &tree->face[i], co, n, &occ[i], NULL);
1289 if(re->test_break(re->tbh))
1293 if(re->test_break(re->tbh))
1296 for(i=0; i<tree->totface; i++) {
1297 tree->occlusion[i] -= occ[i]; //MAX2(1.0f-occ[i], 0.0f);
1298 if(tree->occlusion[i] < 0.0f)
1299 tree->occlusion[i]= 0.0f;
1302 occ_sum_occlusion(tree, tree->root);
1308 static void sample_occ_tree(Render *re, OcclusionTree *tree, OccFace *exclude, float *co, float *n, int thread, int onlyshadow, float *skycol)
1310 float nn[3], bn[3], fac, occ, occlusion, correction;
1313 aocolor= re->wrld.aocolor;
1315 aocolor= WO_AOPLAIN;
1320 occ_lookup(tree, thread, exclude, co, nn, &occ, (aocolor)? bn: NULL);
1322 correction= re->wrld.ao_approx_correction;
1324 occlusion= (1.0f-correction)*(1.0f-occ);
1325 CLAMP(occlusion, 0.0f, 1.0f);
1326 if(correction != 0.0f)
1327 occlusion += correction*exp(-occ);
1330 /* sky shading using bent normal */
1331 if(ELEM(aocolor, WO_AOSKYCOL, WO_AOSKYTEX)) {
1332 fac= 0.5*(1.0f+bn[0]*re->grvec[0]+ bn[1]*re->grvec[1]+ bn[2]*re->grvec[2]);
1333 skycol[0]= (1.0f-fac)*re->wrld.horr + fac*re->wrld.zenr;
1334 skycol[1]= (1.0f-fac)*re->wrld.horg + fac*re->wrld.zeng;
1335 skycol[2]= (1.0f-fac)*re->wrld.horb + fac*re->wrld.zenb;
1338 else { /* WO_AOSKYTEX */
1346 shadeSkyView(skycol, co, bn, dxyview);
1350 VecMulf(skycol, occlusion);
1353 skycol[0]= occlusion;
1354 skycol[1]= occlusion;
1355 skycol[2]= occlusion;
1359 /* ---------------------------- Caching ------------------------------- */
1361 static OcclusionCacheSample *find_occ_sample(OcclusionCache *cache, int x, int y)
1371 if(x < 0 || x >= cache->w || y < 0 || y >= cache->h)
1374 return &cache->sample[y*cache->w + x];
1377 static int sample_occ_cache(OcclusionTree *tree, float *co, float *n, int x, int y, int thread, float *col)
1379 OcclusionCache *cache;
1380 OcclusionCacheSample *samples[4], *sample;
1381 float wn[4], wz[4], wb[4], tx, ty, w, totw, mino, maxo;
1383 int i, x1, y1, x2, y2;
1388 /* first try to find a sample in the same pixel */
1389 cache= &tree->cache[thread];
1391 if(cache->sample && cache->step) {
1392 sample= &cache->sample[(y-cache->y)*cache->w + (x-cache->x)];
1393 if(sample->filled) {
1394 VECSUB(d, sample->co, co);
1396 if(dist2 < 0.5f*sample->dist2 && INPR(sample->n, n) > 0.98f) {
1397 VECCOPY(col, sample->col);
1405 /* try to interpolate between 4 neighbouring pixels */
1406 samples[0]= find_occ_sample(cache, x, y);
1407 samples[1]= find_occ_sample(cache, x+cache->step, y);
1408 samples[2]= find_occ_sample(cache, x, y+cache->step);
1409 samples[3]= find_occ_sample(cache, x+cache->step, y+cache->step);
1412 if(!samples[i] || !samples[i]->filled)
1415 /* require intensities not being too different */
1416 mino= MIN4(samples[0]->intensity, samples[1]->intensity, samples[2]->intensity, samples[3]->intensity);
1417 maxo= MAX4(samples[0]->intensity, samples[1]->intensity, samples[2]->intensity, samples[3]->intensity);
1419 if(maxo - mino > 0.05f)
1422 /* compute weighted interpolation between samples */
1423 col[0]= col[1]= col[2]= 0.0f;
1431 tx= (float)(x2 - x)/(float)(x2 - x1);
1432 ty= (float)(y2 - y)/(float)(y2 - y1);
1434 wb[3]= (1.0f-tx)*(1.0f-ty);
1435 wb[2]= (tx)*(1.0f-ty);
1436 wb[1]= (1.0f-tx)*(ty);
1439 for(i=0; i<4; i++) {
1440 VECSUB(d, samples[i]->co, co);
1443 wz[i]= 1.0f; //(samples[i]->dist2/(1e-4f + dist2));
1444 wn[i]= pow(INPR(samples[i]->n, n), 32.0f);
1446 w= wb[i]*wn[i]*wz[i];
1449 col[0] += w*samples[i]->col[0];
1450 col[1] += w*samples[i]->col[1];
1451 col[2] += w*samples[i]->col[2];
1465 static void sample_occ_surface(ShadeInput *shi)
1467 StrandRen *strand= shi->strand;
1468 StrandSurface *mesh= strand->buffer->surface;
1469 int *face, *index = RE_strandren_get_face(shi->obr, strand, 0);
1470 float w[4], *co1, *co2, *co3, *co4;
1472 if(mesh && mesh->face && mesh->co && mesh->col && index) {
1473 face= mesh->face[*index];
1475 co1= mesh->co[face[0]];
1476 co2= mesh->co[face[1]];
1477 co3= mesh->co[face[2]];
1478 co4= (face[3])? mesh->co[face[3]]: NULL;
1480 InterpWeightsQ3Dfl(co1, co2, co3, co4, strand->vert->co, w);
1482 shi->ao[0]= shi->ao[1]= shi->ao[2]= 0.0f;
1483 VECADDFAC(shi->ao, shi->ao, mesh->col[face[0]], w[0]);
1484 VECADDFAC(shi->ao, shi->ao, mesh->col[face[1]], w[1]);
1485 VECADDFAC(shi->ao, shi->ao, mesh->col[face[2]], w[2]);
1487 VECADDFAC(shi->ao, shi->ao, mesh->col[face[3]], w[3]);
1496 /* ------------------------- External Functions --------------------------- */
1498 static void *exec_strandsurface_sample(void *data)
1500 OcclusionThread *othread= (OcclusionThread*)data;
1501 Render *re= othread->re;
1502 StrandSurface *mesh= othread->mesh;
1503 float col[3], co[3], n[3], *co1, *co2, *co3, *co4;
1506 for(a=othread->begin; a<othread->end; a++) {
1507 face= mesh->face[a];
1508 co1= mesh->co[face[0]];
1509 co2= mesh->co[face[1]];
1510 co3= mesh->co[face[2]];
1513 co4= mesh->co[face[3]];
1515 VecLerpf(co, co1, co3, 0.5f);
1516 CalcNormFloat4(co1, co2, co3, co4, n);
1519 CalcCent3f(co, co1, co2, co3);
1520 CalcNormFloat(co1, co2, co3, n);
1524 sample_occ_tree(re, re->occlusiontree, NULL, co, n, othread->thread, 0, col);
1525 VECCOPY(othread->facecol[a], col);
1531 void make_occ_tree(Render *re)
1533 OcclusionThread othreads[BLENDER_MAX_THREADS];
1534 StrandSurface *mesh;
1536 float col[3], (*facecol)[3];
1537 int a, totface, totthread, *face, *count;
1539 /* ugly, needed for occ_face */
1542 re->i.infostr= "Occlusion preprocessing";
1543 re->stats_draw(re->sdh, &re->i);
1545 re->occlusiontree= occ_tree_build(re);
1547 if(re->occlusiontree) {
1548 if(re->wrld.ao_approx_passes)
1549 occ_compute_passes(re, re->occlusiontree, re->wrld.ao_approx_passes);
1551 for(mesh=re->strandsurface.first; mesh; mesh=mesh->next) {
1552 if(!mesh->face || !mesh->co || !mesh->col)
1555 count= MEM_callocN(sizeof(int)*mesh->totvert, "OcclusionCount");
1556 facecol= MEM_callocN(sizeof(float)*3*mesh->totface, "StrandSurfFaceCol");
1558 totthread= (mesh->totface > 10000)? re->r.threads: 1;
1559 totface= mesh->totface/totthread;
1560 for(a=0; a<totthread; a++) {
1562 othreads[a].facecol= facecol;
1563 othreads[a].thread= a;
1564 othreads[a].mesh= mesh;
1565 othreads[a].begin= a*totface;
1566 othreads[a].end= (a == totthread-1)? mesh->totface: (a+1)*totface;
1569 if(totthread == 1) {
1570 exec_strandsurface_sample(&othreads[0]);
1573 BLI_init_threads(&threads, exec_strandsurface_sample, totthread);
1575 for(a=0; a<totthread; a++)
1576 BLI_insert_thread(&threads, &othreads[a]);
1578 BLI_end_threads(&threads);
1581 for(a=0; a<mesh->totface; a++) {
1582 face= mesh->face[a];
1584 VECCOPY(col, facecol[a]);
1585 VECADD(mesh->col[face[0]], mesh->col[face[0]], col);
1587 VECADD(mesh->col[face[1]], mesh->col[face[1]], col);
1589 VECADD(mesh->col[face[2]], mesh->col[face[2]], col);
1593 VECADD(mesh->col[face[3]], mesh->col[face[3]], col);
1598 for(a=0; a<mesh->totvert; a++)
1600 VecMulf(mesh->col[a], 1.0f/count[a]);
1608 void free_occ(Render *re)
1610 if(re->occlusiontree) {
1611 occ_free_tree(re->occlusiontree);
1612 re->occlusiontree = NULL;
1616 void sample_occ(Render *re, ShadeInput *shi)
1618 OcclusionTree *tree= re->occlusiontree;
1619 OcclusionCache *cache;
1620 OcclusionCacheSample *sample;
1626 sample_occ_surface(shi);
1628 /* try to get result from the cache if possible */
1629 else if(shi->depth!=0 || !sample_occ_cache(tree, shi->co, shi->vno, shi->xs, shi->ys, shi->thread, shi->ao)) {
1630 /* no luck, let's sample the occlusion */
1631 exclude.obi= shi->obi - re->objectinstance;
1632 exclude.facenr= shi->vlr->index;
1633 onlyshadow= (shi->mat->mode & MA_ONLYSHADOW);
1634 sample_occ_tree(re, tree, &exclude, shi->co, shi->vno, shi->thread, onlyshadow, shi->ao);
1636 /* fill result into sample, each time */
1638 cache= &tree->cache[shi->thread];
1640 if(cache->sample && cache->step) {
1641 sample= &cache->sample[(shi->ys-cache->y)*cache->w + (shi->xs-cache->x)];
1642 VECCOPY(sample->co, shi->co);
1643 VECCOPY(sample->n, shi->vno);
1644 VECCOPY(sample->col, shi->ao);
1645 sample->intensity= MAX3(sample->col[0], sample->col[1], sample->col[2]);
1646 sample->dist2= INPR(shi->dxco, shi->dxco) + INPR(shi->dyco, shi->dyco);
1659 void cache_occ_samples(Render *re, RenderPart *pa, ShadeSample *ssamp)
1661 OcclusionTree *tree= re->occlusiontree;
1663 OcclusionCache *cache;
1664 OcclusionCacheSample *sample;
1668 int *ro=NULL, *rp=NULL, *rz=NULL, onlyshadow;
1669 int x, y, step = CACHE_STEP;
1674 cache= &tree->cache[pa->thread];
1675 cache->w= pa->rectx;
1676 cache->h= pa->recty;
1677 cache->x= pa->disprect.xmin;
1678 cache->y= pa->disprect.ymin;
1680 cache->sample= MEM_callocN(sizeof(OcclusionCacheSample)*cache->w*cache->h, "OcclusionCacheSample");
1681 sample= cache->sample;
1687 /* fake pixel struct for non-osa */
1696 /* compute a sample at every step pixels */
1697 for(y=pa->disprect.ymin; y<pa->disprect.ymax; y++) {
1698 for(x=pa->disprect.xmin; x<pa->disprect.xmax; x++, sample++, rd++, ro++, rp++, rz++) {
1699 if(!(((x - pa->disprect.xmin + step) % step) == 0 || x == pa->disprect.xmax-1))
1701 if(!(((y - pa->disprect.ymin + step) % step) == 0 || y == pa->disprect.ymax-1))
1707 shade_samples_fill_with_ps(ssamp, (PixStr *)(*rd), x, y);
1715 shade_samples_fill_with_ps(ssamp, &ps, x, y);
1720 onlyshadow= (shi->mat->mode & MA_ONLYSHADOW);
1721 exclude.obi= shi->obi - re->objectinstance;
1722 exclude.facenr= shi->vlr->index;
1723 sample_occ_tree(re, tree, &exclude, shi->co, shi->vno, shi->thread, onlyshadow, shi->ao);
1725 VECCOPY(sample->co, shi->co);
1726 VECCOPY(sample->n, shi->vno);
1727 VECCOPY(sample->col, shi->ao);
1728 sample->intensity= MAX3(sample->col[0], sample->col[1], sample->col[2]);
1729 sample->dist2= INPR(shi->dxco, shi->dxco) + INPR(shi->dyco, shi->dyco);
1735 if(re->test_break(re->tbh))
1741 void free_occ_samples(Render *re, RenderPart *pa)
1743 OcclusionTree *tree= re->occlusiontree;
1744 OcclusionCache *cache;
1747 cache= &tree->cache[pa->thread];
1750 MEM_freeN(cache->sample);