Approximate Ambient Occlusion
authorBrecht Van Lommel <brechtvanlommel@pandora.be>
Thu, 17 Jan 2008 19:27:16 +0000 (19:27 +0000)
committerBrecht Van Lommel <brechtvanlommel@pandora.be>
Thu, 17 Jan 2008 19:27:16 +0000 (19:27 +0000)
=============================

A new approximate ambient occlusion method has been added, next to the
existing one based on raytracing. This method is specifically targetted
at use in animations, since it is inherently noise free, and so will
not flicker across frames.

http://www.blender.org/development/current-projects/changes-since-244/approximate-ambient-occlusion/
http://peach.blender.org/index.php/approximate-ambient-occlusion/

Further improvements are still needed, but it can be tested already. There
are still a number of known issues:

- Bias errors on backfaces.
- For performance, instanced object do not occlude currently.
- Sky textures don't work well, the derivatives for texture evaluation
  are not correct.
- Multiple passes do not work entirely correct (they are not accurate
  to begin with, but could be better).

18 files changed:
source/blender/blenkernel/intern/world.c
source/blender/blenloader/intern/readfile.c
source/blender/makesdna/DNA_world_types.h
source/blender/render/extern/include/RE_shader_ext.h
source/blender/render/intern/include/occlusion.h [new file with mode: 0644]
source/blender/render/intern/include/render_types.h
source/blender/render/intern/include/renderdatabase.h
source/blender/render/intern/include/shading.h
source/blender/render/intern/source/convertblender.c
source/blender/render/intern/source/occlusion.c [new file with mode: 0644]
source/blender/render/intern/source/pipeline.c
source/blender/render/intern/source/rendercore.c
source/blender/render/intern/source/renderdatabase.c
source/blender/render/intern/source/shadeinput.c
source/blender/render/intern/source/shadeoutput.c
source/blender/render/intern/source/strand.c
source/blender/render/intern/source/texture.c
source/blender/src/buttons_shading.c

index 625ca57dbf30d745dfd5c570245c9618ecf30914..8d3234475a417bc3b21776e2bb668a1b49d793ba 100644 (file)
@@ -98,12 +98,12 @@ World *add_world(char *name)
        wrld->exp= 0.0f;
        wrld->exposure=wrld->range= 1.0f;
 
-       wrld->aodist= 5.0;
+       wrld->aodist= 5.0f;
        wrld->aosamp= 5;
-       wrld->aoenergy= 1.0;
-       wrld->aobias= 0.05;
+       wrld->aoenergy= 1.0f;
+       wrld->aobias= 0.05f;
        wrld->ao_samp_method = WO_AOSAMP_HAMMERSLEY;    
-
+       wrld->ao_approx_error= 0.25f;
        
        wrld->physicsEngine= WOPHY_BULLET;//WOPHY_SUMO; Bullet by default
        wrld->preview = NULL;
index 6a2e3aa481af91291793377efc55047ca5e0aa9c..5712f8f03fca27f57435f5feaf7faa7e311c5f79 100644 (file)
@@ -6698,6 +6698,7 @@ static void do_versions(FileData *fd, Library *lib, Main *main)
                Lamp *la;
                Material *ma;
                ParticleSettings *part;
+               World *wrld;
                Mesh *me;
                
                /* unless the file was created 2.44.3 but not 2.45, update the constraints */
@@ -6878,6 +6879,11 @@ static void do_versions(FileData *fd, Library *lib, Main *main)
                        }
                }
 
+               for(wrld=main->world.first; wrld; wrld= wrld->id.next) {
+                       if(wrld->ao_approx_error == 0.0f)
+                               wrld->ao_approx_error= 0.25f;
+               }
+
                if (main->versionfile < 245 || main->subversionfile < 12)
                {
                        /* initialize skeleton generation toolsettings */
index 9e5ec4cdb9a01a20cd99687f985401234ce37ed0..e7a8d839227e309bf1ec8dda21b217802bea758b 100644 (file)
@@ -104,9 +104,8 @@ typedef struct World {
        float aodist, aodistfac, aoenergy, aobias;
        short aomode, aosamp, aomix, aocolor;
        float ao_adapt_thresh, ao_adapt_speed_fac;
-       float pad2[2];
-       short ao_samp_method;
-       short pad1[3];
+       float ao_approx_error, ao_approx_correction;
+       short ao_samp_method, ao_gather_method, ao_approx_passes, pad1;
        
        float *aosphere, *aotables;
        
@@ -151,12 +150,17 @@ typedef struct World {
 /* aomode (use distances & random sampling modes) */
 #define WO_AODIST              1
 #define WO_AORNDSMP            2
+#define WO_AOCACHE             4
 
 /* aocolor */
 #define WO_AOPLAIN     0
 #define WO_AOSKYCOL    1
 #define WO_AOSKYTEX    2
 
+/* ao_gather_method */
+#define WO_AOGATHER_RAYTRACE   0
+#define WO_AOGATHER_APPROX             1
+
 /* texco (also in DNA_material_types.h) */
 #define TEXCO_ANGMAP   64
 #define TEXCO_H_SPHEREMAP      256
index 8971126e643bf4be9d499c999435915a8974446b..5d5db0471d8a01d27d3217ca541d32ac902289c2 100644 (file)
@@ -64,6 +64,7 @@ struct ShadeInputCopy {
        
        struct Material *mat;
        struct VlakRen *vlr;
+       struct StrandRen *strand;
        struct ObjectInstanceRen *obi;
        struct ObjectRen *obr;
        int facenr;
@@ -96,6 +97,7 @@ typedef struct ShadeInput
        
        struct Material *mat;
        struct VlakRen *vlr;
+       struct StrandRen *strand;
        struct ObjectInstanceRen *obi;
        struct ObjectRen *obr;
        int facenr;
@@ -132,7 +134,7 @@ typedef struct ShadeInput
        /* texture coordinates */
        float lo[3], gl[3], ref[3], orn[3], winco[3], sticky[3], vcol[4], rad[3];
        float refcol[4], displace[3];
-       float strand, tang[3], stress, winspeed[4];
+       float strandco, tang[3], stress, winspeed[4];
        float duplilo[3], dupliuv[3];
 
        ShadeInputUV uv[8];   /* 8 = MAX_MTFACE */
diff --git a/source/blender/render/intern/include/occlusion.h b/source/blender/render/intern/include/occlusion.h
new file mode 100644 (file)
index 0000000..3f10d50
--- /dev/null
@@ -0,0 +1,51 @@
+/* 
+ * $Id$
+ *
+ * ***** BEGIN GPL LICENSE BLOCK *****
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The Original Code is Copyright (C) 2008 Blender Foundation.
+ * All rights reserved.
+ *
+ * The Original Code is: all of this file.
+ *
+ * Contributor(s): Brecht Van Lommel.
+ *
+ * ***** END GPL LICENSE BLOCK *****
+ */
+
+#ifndef OCCLUSION_H
+#define OCCLUSION_H
+
+struct Render;
+struct ShadeInput;
+struct ShadeResult;
+struct RenderPart;
+struct ShadeSample;
+struct DerivedMesh;
+struct ObjectRen;
+
+void make_occ_tree(struct Render *re);
+void free_occ(struct Render *re);
+void sample_occ(struct Render *re, struct ShadeInput *shi);
+
+void cache_occ_samples(struct Render *re, struct RenderPart *pa, struct ShadeSample *ssamp);
+void free_occ_samples(struct Render *re, struct RenderPart *pa);
+
+void *cache_occ_mesh(struct Render *re, struct ObjectRen *obr, struct DerivedMesh *dm, float mat[][4]);
+
+#endif
+
index e3a9291219167a21a27bedae79fa01937bf44520..6124870d63ea57448872eb85d00e31ebac4e470f 100644 (file)
@@ -158,6 +158,10 @@ struct Render
        
        /* octree tables and variables for raytrace */
        void *raytree;
+
+       /* occlusion tree */
+       void *occlusiontree;
+       ListBase occlusionmesh;
        
        /* use this instead of R.r.cfra */
        float cfra;     
@@ -341,6 +345,7 @@ typedef struct StrandBuffer {
 
        struct ObjectRen *obr;
        struct Material *ma;
+       void *occlusionmesh;
        unsigned int lay;
        int overrideuv;
        int flag, maxdepth;
index a3a3b4e5e4a72035f8183a6a31b922f7e3bbf0ce..a74579acc4af1aa6ccb53fe036e27cf7e36ae86c 100644 (file)
@@ -70,6 +70,7 @@ typedef struct StrandTableNode {
        float *winspeed;
        float *surfnor;
        float *simplify;
+       int *face;
        struct MCol *mcol;
        float *uv;
        int totuv, totmcol;
@@ -114,6 +115,7 @@ float *RE_strandren_get_surfnor(struct ObjectRen *obr, struct StrandRen *strand,
 float *RE_strandren_get_uv(struct ObjectRen *obr, struct StrandRen *strand, int n, char **name, int verify);
 struct MCol *RE_strandren_get_mcol(struct ObjectRen *obr, struct StrandRen *strand, int n, char **name, int verify);
 float *RE_strandren_get_simplify(struct ObjectRen *obr, struct StrandRen *strand, int verify);
+int *RE_strandren_get_face(struct ObjectRen *obr, struct StrandRen *strand, int verify);
 float *RE_strandren_get_winspeed(struct ObjectInstanceRen *obi, struct StrandRen *strand, int verify);
 
 struct VertRen *RE_vertren_copy(struct ObjectRen *obr, struct VertRen *ver);
index 6ca63a2976d3041edf0862584927c79f897190a8..e3ffe5ef28fb217ef6edd271afb6773a971a26c0 100644 (file)
@@ -69,6 +69,7 @@ void shade_input_initialize(struct ShadeInput *shi, struct RenderPart *pa, struc
 
 void shade_sample_initialize(struct ShadeSample *ssamp, struct RenderPart *pa, struct RenderLayer *rl);
 void shade_samples_do_AO(struct ShadeSample *ssamp);
+void shade_samples_fill_with_ps(struct ShadeSample *ssamp, struct PixStr *ps, int x, int y);
 int shade_samples(struct ShadeSample *ssamp, struct PixStr *ps, int x, int y);
 
 void vlr_set_uv_indices(struct VlakRen *vlr, int *i1, int *i2, int *i3);
index 25b5dac2a55a405e30c474092ffa17cb009d859c..8117a85d0158cf36d5c4768d78d510ee71479f5e 100644 (file)
 
 #include "envmap.h"
 #include "multires.h"
+#include "occlusion.h"
 #include "render_types.h"
 #include "rendercore.h"
 #include "renderdatabase.h"
@@ -1497,7 +1498,7 @@ static int render_new_particle_system(Render *re, ObjectRen *obr, ParticleSystem
        float *orco=0,*surfnor=0,*uvco=0, strandlen=0.0f, curlen=0.0f;
        float hasize, pa_size, pa_time, r_tilt, cfra=bsystem_time(ob,(float)CFRA,0.0);
        float adapt_angle=0.0, adapt_pix=0.0, random, simplify[2];
-       int i, a, k, max_k=0, totpart, totuv=0, override_uv=-1, dosimplify = 0;
+       int i, a, k, max_k=0, totpart, totuv=0, override_uv=-1, dosimplify = 0, doapproxao = 0;
        int path_possible=0, keys_possible=0, baked_keys=0, totchild=psys->totchild;
        int seed, path_nbr=0, path=0, orco1=0, adapt=0, uv[3]={0,0,0}, num;
        char **uv_name=0;
@@ -1656,6 +1657,10 @@ static int render_new_particle_system(Render *re, ObjectRen *obr, ParticleSystem
                                        strandbuf->flag |= R_STRAND_B_UNITS;
 
                                svert= strandbuf->vert;
+
+                               if((re->wrld.mode & WO_AMB_OCC) && (re->wrld.ao_gather_method == WO_AOGATHER_APPROX))
+                                       if(ma->amb != 0.0f)
+                                               doapproxao= 1;
                        }
                }
        }
@@ -1704,14 +1709,14 @@ static int render_new_particle_system(Render *re, ObjectRen *obr, ParticleSystem
                        else
                                psys_particle_on_emitter(ob, psmd,part->from,pa->num,pa->num_dmcache,pa->fuv,pa->foffset,co,nor,0,0,orco,0);
 
-                       if(uvco && ELEM(part->from,PART_FROM_FACE,PART_FROM_VOLUME)){
-                               layer=psmd->dm->faceData.layers + CustomData_get_layer_index(&psmd->dm->faceData,CD_MFACE);
+                       num= pa->num_dmcache;
 
-                   num= pa->num_dmcache;
+                       if(num == DMCACHE_NOTFOUND)
+                               if(pa->num < psmd->dm->getNumFaces(psmd->dm))
+                                       num= pa->num;
 
-                               if(num == DMCACHE_NOTFOUND)
-                                       if(pa->num < psmd->dm->getNumFaces(psmd->dm))
-                                               num= pa->num;
+                       if(uvco && ELEM(part->from,PART_FROM_FACE,PART_FROM_VOLUME)){
+                               layer=psmd->dm->faceData.layers + CustomData_get_layer_index(&psmd->dm->faceData,CD_MFACE);
 
                                for(i=0; i<totuv; i++){
                                        if(num != DMCACHE_NOTFOUND) {
@@ -1761,6 +1766,8 @@ static int render_new_particle_system(Render *re, ObjectRen *obr, ParticleSystem
 
                        r_tilt=2.0f*cpa->rand[2];
 
+                       num= cpa->num;
+
                        /* get orco */
                        psys_particle_on_emitter(ob, psmd,
                                (part->childtype == PART_CHILD_FACES)? PART_FROM_FACE: PART_FROM_PARTICLE,
@@ -1831,6 +1838,11 @@ static int render_new_particle_system(Render *re, ObjectRen *obr, ParticleSystem
                                VECCOPY(snor, surfnor);
                        }
 
+                       if(doapproxao && num >= 0) {
+                               int *facenum= RE_strandren_get_face(obr, strand, 1);
+                               *facenum= num;
+                       }
+
                        if(uvco){
                                for(i=0; i<totuv; i++){
                                        if(i != override_uv) {
@@ -1930,6 +1942,9 @@ static int render_new_particle_system(Render *re, ObjectRen *obr, ParticleSystem
                        break;
        }
 
+       if(doapproxao)
+               strandbuf->occlusionmesh= cache_occ_mesh(re, obr, psmd->dm, mat);
+
 /* 4. clean up */
        if(ma) do_mat_ipo(ma);
 
@@ -3636,7 +3651,7 @@ void init_render_world(Render *re)
                if(re->osa)
                        while(re->wrld.aosamp*re->wrld.aosamp < re->osa) 
                                re->wrld.aosamp++;
-               if(!(re->r.mode & R_RAYTRACE))
+               if(!(re->r.mode & R_RAYTRACE) && (re->wrld.ao_gather_method == WO_AOGATHER_RAYTRACE))
                        re->wrld.mode &= ~WO_AMB_OCC;
        }
        else {
@@ -4069,6 +4084,7 @@ void RE_Database_Free(Render *re)
        if(re->r.mode & R_RAYTRACE) freeraytree(re);
 
        free_sss(re);
+       free_occ(re);
        
        re->totvlak=re->totvert=re->totstrand=re->totlamp=re->tothalo= 0;
        re->i.convertdone= 0;
@@ -4289,7 +4305,7 @@ void RE_Database_FromScene(Render *re, Scene *scene, int use_camera_view)
        }
        
        init_render_world(re);  /* do first, because of ambient. also requires re->osa set correct */
-       if(re->wrld.mode & WO_AMB_OCC) {
+       if((re->r.mode & R_RAYTRACE) && (re->wrld.mode & WO_AMB_OCC)) {
                if (re->wrld.ao_samp_method == WO_AOSAMP_HAMMERSLEY)
                        init_render_hammersley(re);
                else if (re->wrld.ao_samp_method == WO_AOSAMP_CONSTANT)
@@ -4354,10 +4370,16 @@ void RE_Database_FromScene(Render *re, Scene *scene, int use_camera_view)
                
                if(!re->test_break())
                        project_renderdata(re, projectverto, re->r.mode & R_PANORAMA, 0, 1);
+               
+               /* Occlusion */
+               if((re->wrld.mode & WO_AMB_OCC) && !re->test_break())
+                       if(re->wrld.ao_gather_method == WO_AOGATHER_APPROX)
+                               if(re->r.renderer==R_INTERN)
+                                       make_occ_tree(re);
 
                /* SSS */
                if((re->r.mode & R_SSS) && !re->test_break())
-                       if (re->r.renderer==R_INTERN)
+                       if(re->r.renderer==R_INTERN)
                                make_sss_tree(re);
        }
        
@@ -4837,7 +4859,7 @@ void RE_Database_Baking(Render *re, Scene *scene, int type, Object *actob)
        }
        
        init_render_world(re);  /* do first, because of ambient. also requires re->osa set correct */
-       if(re->wrld.mode & WO_AMB_OCC) {
+       if((re->r.mode & R_RAYTRACE) && (re->wrld.mode & WO_AMB_OCC)) {
                if (re->wrld.ao_samp_method == WO_AOSAMP_HAMMERSLEY)
                        init_render_hammersley(re);
                else if (re->wrld.ao_samp_method == WO_AOSAMP_CONSTANT)
diff --git a/source/blender/render/intern/source/occlusion.c b/source/blender/render/intern/source/occlusion.c
new file mode 100644 (file)
index 0000000..19394b4
--- /dev/null
@@ -0,0 +1,1694 @@
+/* 
+ * $Id$
+ *
+ * ***** BEGIN GPL LICENSE BLOCK *****
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The Original Code is Copyright (C) 2008 Blender Foundation.
+ * All rights reserved.
+ *
+ * The Original Code is: all of this file.
+ *
+ * Contributor(s): Brecht Van Lommel.
+ *
+ * ***** END GPL LICENSE BLOCK *****
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "MEM_guardedalloc.h"
+
+#include "DNA_material_types.h"
+#include "DNA_meshdata_types.h"
+
+#include "BLI_arithb.h"
+#include "BLI_blenlib.h"
+#include "BLI_memarena.h"
+#include "BLI_threads.h"
+
+#include "BKE_DerivedMesh.h"
+#include "BKE_global.h"
+#include "BKE_utildefines.h"
+
+#include "RE_shader_ext.h"
+
+/* local includes */
+#include "occlusion.h"
+#include "render_types.h"
+#include "rendercore.h"
+#include "renderdatabase.h"
+#include "pixelshading.h"
+#include "shading.h"
+#include "zbuf.h"
+
+/* ------------------------- Declarations --------------------------- */
+
+#define INVALID_INDEX ((int)(~0))
+#define INVPI 0.31830988618379069f
+#define TOTCHILD 8
+#define CACHE_STEP 3
+
+typedef struct OcclusionCacheSample {
+       float co[3], n[3], col[3], intensity, dist2;
+       int x, y, filled;
+} OcclusionCacheSample;
+
+typedef struct OcclusionCache {
+       OcclusionCacheSample *sample;
+       int x, y, w, h, step;
+} OcclusionCache;
+
+typedef struct OcclusionCacheMesh {
+       struct OcclusionCacheMesh *next, *prev;
+       ObjectRen *obr;
+       int (*face)[4];
+       float (*co)[3];
+       float (*col)[3];
+       int totvert, totface;
+} OcclusionCacheMesh;
+
+typedef struct OccFace {
+       int obi;
+       int facenr;
+} OccFace;
+
+typedef struct OccNode {
+       float co[3], area;
+       float sh[9], dco;
+       float occlusion;
+       int childflag;
+       union {
+               //OccFace face;
+               int face;
+               struct OccNode *node;
+       } child[TOTCHILD];
+} OccNode;
+
+typedef struct OcclusionTree {
+       MemArena *arena;
+
+       float (*co)[3];         /* temporary during build */
+
+       OccFace *face;          /* instance and face indices */
+       float *occlusion;       /* occlusion for faces */
+       
+       OccNode *root;
+
+       OccNode **stack[BLENDER_MAX_THREADS];
+       int maxdepth;
+
+       int totface;
+
+       float error;
+       float distfac;
+
+       OcclusionCache *cache;
+} OcclusionTree;
+
+/* ------------------------- Shading --------------------------- */
+
+extern Render R; // meh
+
+#if 0
+static void occ_shade(ShadeSample *ssamp, ObjectInstanceRen *obi, VlakRen *vlr, float *rad)
+{
+       ShadeInput *shi= ssamp->shi;
+       ShadeResult *shr= ssamp->shr;
+       float l, u, v, *v1, *v2, *v3;
+       
+       /* init */
+       if(vlr->v4) {
+               shi->u= u= 0.5f;
+               shi->v= v= 0.5f;
+       }
+       else {
+               shi->u= u= 1.0f/3.0f;
+               shi->v= v= 1.0f/3.0f;
+       }
+
+       /* setup render coordinates */
+       v1= vlr->v1->co;
+       v2= vlr->v2->co;
+       v3= vlr->v3->co;
+       
+       /* renderco */
+       l= 1.0f-u-v;
+       
+       shi->co[0]= l*v3[0]+u*v1[0]+v*v2[0];
+       shi->co[1]= l*v3[1]+u*v1[1]+v*v2[1];
+       shi->co[2]= l*v3[2]+u*v1[2]+v*v2[2];
+       
+       shade_input_set_triangle_i(shi, obi, vlr, 0, 1, 2);
+
+       /* set up view vector */
+       VECCOPY(shi->view, shi->co);
+       Normalize(shi->view);
+       
+       /* cache for shadow */
+       shi->samplenr++;
+       
+       shi->xs= 0; // TODO
+       shi->ys= 0;
+       
+       shade_input_set_normals(shi);
+
+       /* no normal flip */
+       if(shi->flippednor)
+               shade_input_flip_normals(shi);
+
+       /* not a pretty solution, but fixes common cases */
+       if(shi->obr->ob && shi->obr->ob->transflag & OB_NEG_SCALE) {
+               VecMulf(shi->vn, -1.0f);
+               VecMulf(shi->vno, -1.0f);
+       }
+
+       /* init material vars */
+       // note, keep this synced with render_types.h
+       memcpy(&shi->r, &shi->mat->r, 23*sizeof(float));
+       shi->har= shi->mat->har;
+       
+       /* render */
+       shade_input_set_shade_texco(shi);
+       shade_material_loop(shi, shr); /* todo: nodes */
+       
+       VECCOPY(rad, shr->combined);
+}
+
+static void occ_build_shade(Render *re, OcclusionTree *tree)
+{
+       ShadeSample ssamp;
+       ObjectInstanceRen *obi;
+       VlakRen *vlr;
+       int a;
+
+       R= *re;
+
+       /* setup shade sample with correct passes */
+       memset(&ssamp, 0, sizeof(ShadeSample));
+       ssamp.shi[0].lay= re->scene->lay;
+       ssamp.shi[0].passflag= SCE_PASS_DIFFUSE|SCE_PASS_RGBA;
+       ssamp.shi[0].combinedflag= ~(SCE_PASS_SPEC);
+       ssamp.tot= 1;
+
+       for(a=0; a<tree->totface; a++) {
+               obi= &R.objectinstance[tree->face[a].obi];
+               vlr= RE_findOrAddVlak(obi->obr, tree->face[a].vlr);
+
+               occ_shade(&ssamp, obi, vlr, tree->rad[a]);
+       }
+}
+#endif
+
+/* ------------------------- Spherical Harmonics --------------------------- */
+
+/* Use 2nd order SH => 9 coefficients, stored in this order:
+   0 = (0,0),
+   1 = (1,-1), 2 = (1,0), 3 = (1,1),
+   4 = (2,-2), 5 = (2,-1), 6 = (2,0), 7 = (2,1), 8 = (2,2) */
+
+static void sh_copy(float *shresult, float *sh)
+{
+       memcpy(shresult, sh, sizeof(float)*9);
+}
+
+static void sh_mul(float *sh, float f)
+{
+       int i;
+
+       for(i=0; i<9; i++)
+               sh[i] *= f;
+}
+
+static void sh_add(float *shresult, float *sh1, float *sh2)
+{
+       int i;
+
+       for(i=0; i<9; i++)
+               shresult[i]= sh1[i] + sh2[i];
+}
+
+static void sh_from_disc(float *n, float area, float *shresult)
+{
+       /* See formula (3) in:
+          "An Efficient Representation for Irradiance Environment Maps" */
+       float sh[9], x, y, z;
+
+       x= n[0];
+       y= n[1];
+       z= n[2];
+
+       sh[0]= 0.282095f;
+
+       sh[1]= 0.488603f*y;
+       sh[2]= 0.488603f*z;
+       sh[3]= 0.488603f*x;
+       
+       sh[4]= 1.092548f*x*y;
+       sh[5]= 1.092548f*y*z;
+       sh[6]= 0.315392f*(3.0f*z*z - 1.0f);
+       sh[7]= 1.092548f*x*z;
+       sh[8]= 0.546274f*(x*x - y*y);
+
+       sh_mul(sh, area);
+       sh_copy(shresult, sh);
+}
+
+static float sh_eval(float *sh, float *v)
+{
+       /* See formula (13) in:
+          "An Efficient Representation for Irradiance Environment Maps" */
+       static const float c1 = 0.429043f, c2 = 0.511664f, c3 = 0.743125f;
+       static const float c4 = 0.886227f, c5 = 0.247708f;
+       float x, y, z, sum;
+
+       x= v[0];
+       y= v[1];
+       z= v[2];
+
+       sum= c1*sh[8]*(x*x - y*y);
+       sum += c3*sh[6]*z*z;
+       sum += c4*sh[0];
+       sum += -c5*sh[6];
+       sum += 2.0f*c1*(sh[4]*x*y + sh[7]*x*z + sh[5]*y*z);
+       sum += 2.0f*c2*(sh[3]*x + sh[1]*y + sh[2]*z);
+
+       return sum;
+}
+
+/* ------------------------------ Building --------------------------------- */
+
+static void occ_face(const OccFace *face, float *co, float *normal, float *area)
+{
+       ObjectInstanceRen *obi;
+       VlakRen *vlr;
+
+       obi= &R.objectinstance[face->obi];
+       vlr= RE_findOrAddVlak(obi->obr, face->facenr);
+       
+       if(co) {
+               if(vlr->v4)
+                       VecLerpf(co, vlr->v1->co, vlr->v3->co, 0.5f);
+               else
+                       CalcCent3f(co, vlr->v1->co, vlr->v2->co, vlr->v3->co);
+
+               if(obi->flag & R_TRANSFORMED)
+                       Mat4MulVecfl(obi->mat, co);
+       }
+       
+       if(normal) {
+               normal[0]= -vlr->n[0];
+               normal[1]= -vlr->n[1];
+               normal[2]= -vlr->n[2];
+
+               if(obi->flag & R_TRANSFORMED) {
+                       float xn, yn, zn, (*imat)[3]= obi->imat;
+
+                       xn= normal[0];
+                       yn= normal[1];
+                       zn= normal[2];
+
+                       normal[0]= imat[0][0]*xn + imat[0][1]*yn + imat[0][2]*zn;
+                       normal[1]= imat[1][0]*xn + imat[1][1]*yn + imat[1][2]*zn;
+                       normal[2]= imat[2][0]*xn + imat[2][1]*yn + imat[2][2]*zn;
+               }
+       }
+
+       if(area) {
+               /* todo: correct area for instances */
+               if(vlr->v4)
+                       *area= AreaQ3Dfl(vlr->v1->co, vlr->v2->co, vlr->v3->co, vlr->v4->co);
+               else
+                       *area= AreaT3Dfl(vlr->v1->co, vlr->v2->co, vlr->v3->co);
+       }
+}
+
+static void occ_sum_occlusion(OcclusionTree *tree, OccNode *node)
+{
+       OccNode *child;
+       float occ, area, totarea;
+       int a, b;
+
+       occ= 0.0f;
+       totarea= 0.0f;
+
+       for(b=0; b<TOTCHILD; b++) {
+               if(node->childflag & (1<<b)) {
+                       a= node->child[b].face;
+                       occ_face(&tree->face[a], 0, 0, &area);
+                       occ += area*tree->occlusion[a];
+                       totarea += area;
+               }
+               else if(node->child[b].node) {
+                       child= node->child[b].node;
+                       occ_sum_occlusion(tree, child);
+
+                       occ += child->area*child->occlusion;
+                       totarea += child->area;
+               }
+       }
+
+       if(totarea != 0.0f)
+               occ /= totarea;
+       
+       node->occlusion= occ;
+}
+
+static int occ_find_bbox_axis(OcclusionTree *tree, int begin, int end, float *min, float *max)
+{
+       float len, maxlen= -1.0f;
+       int a, axis = 0;
+
+       INIT_MINMAX(min, max);
+
+       for(a=begin; a<end; a++)
+               DO_MINMAX(tree->co[a], min, max)
+
+       for(a=0; a<3; a++) {
+               len= max[a] - min[a];
+
+               if(len > maxlen) {
+                       maxlen= len;
+                       axis= a;
+               }
+       }
+
+       return axis;
+}
+
+void occ_node_from_face(OccFace *face, OccNode *node)
+{
+       float n[3];
+
+       occ_face(face, node->co, n, &node->area);
+       node->dco= 0.0f;
+       sh_from_disc(n, node->area, node->sh);
+}
+
+static void occ_build_dco(OcclusionTree *tree, OccNode *node, float *co, float *dco)
+{
+       OccNode *child;
+       float dist, d[3], nco[3];
+       int b;
+
+       for(b=0; b<TOTCHILD; b++) {
+               if(node->childflag & (1<<b)) {
+                       occ_face(tree->face+node->child[b].face, nco, 0, 0);
+               }
+               else if(node->child[b].node) {
+                       child= node->child[b].node;
+                       occ_build_dco(tree, child, co, dco);
+                       VECCOPY(nco, child->co);
+               }
+
+               VECSUB(d, nco, co);
+               dist= INPR(d, d);
+               if(dist > *dco)
+                       *dco= dist;
+       }
+}
+
+static void occ_build_split(OcclusionTree *tree, int begin, int end, int *split)
+{
+       float min[3], max[3], mid;
+       int axis, a, enda;
+
+       /* split in middle of boundbox. this seems faster than median split
+        * on complex scenes, possibly since it avoids two distant faces to
+        * be in the same node better? */
+       axis= occ_find_bbox_axis(tree, begin, end, min, max);
+       mid= 0.5f*(min[axis]+max[axis]);
+
+       a= begin;
+       enda= end;
+       while(a<enda) {
+               if(tree->co[a][axis] > mid) {
+                       enda--;
+                       SWAP(OccFace, tree->face[a], tree->face[enda]);
+                       SWAP(float, tree->co[a][0], tree->co[enda][0]);
+                       SWAP(float, tree->co[a][1], tree->co[enda][1]);
+                       SWAP(float, tree->co[a][2], tree->co[enda][2]);
+               }
+               else
+                       a++;
+       }
+
+       *split= enda;
+}
+
+static void occ_build_8_split(OcclusionTree *tree, int begin, int end, int *offset, int *count)
+{
+       /* split faces into eight groups */
+       int b, splitx, splity[2], splitz[4];
+
+       occ_build_split(tree, begin, end, &splitx);
+
+       occ_build_split(tree, begin, splitx, &splity[0]);
+       occ_build_split(tree, splitx, end, &splity[1]);
+
+       occ_build_split(tree, begin, splity[0], &splitz[0]);
+       occ_build_split(tree, splity[0], splitx, &splitz[1]);
+       occ_build_split(tree, splitx, splity[1], &splitz[2]);
+       occ_build_split(tree, splity[1], end, &splitz[3]);
+
+       offset[0]= begin;
+       offset[1]= splitz[0];
+       offset[2]= splity[0];
+       offset[3]= splitz[1];
+       offset[4]= splitx;
+       offset[5]= splitz[2];
+       offset[6]= splity[1];
+       offset[7]= splitz[3];
+
+       for(b=0; b<7; b++)
+               count[b]= offset[b+1] - offset[b];
+       count[7]= end - offset[7];
+}
+
+static OccNode *occ_build_recursive(OcclusionTree *tree, int begin, int end, int depth)
+{
+       OccNode *node, *child, tmpnode;
+       OccFace *face;
+       int a, b, offset[TOTCHILD], count[TOTCHILD];
+
+       /* keep track of maximum depth for stack */
+       if(depth > tree->maxdepth)
+               tree->maxdepth= depth;
+
+       /* add a new node */
+       node= BLI_memarena_alloc(tree->arena, sizeof(OccNode));
+       node->occlusion= 1.0f;
+
+       /* leaf node with only children */
+       if(end - begin <= TOTCHILD) {
+               for(a=begin, b=0; a<end; a++, b++) {
+                       face= &tree->face[a];
+                       node->child[b].face= a;
+                       node->childflag |= (1<<b);
+               }
+       }
+       else {
+               /* order faces */
+               occ_build_8_split(tree, begin, end, offset, count);
+
+               for(b=0; b<TOTCHILD; b++) {
+                       if(count[b] == 0) {
+                               node->child[b].node= NULL;
+                       }
+                       else if(count[b] == 1) {
+                               face= &tree->face[offset[b]];
+                               node->child[b].face= offset[b];
+                               node->childflag |= (1<<b);
+                       }
+                       else {
+                               child= occ_build_recursive(tree, offset[b], offset[b]+count[b], depth+1);
+                               node->child[b].node= child;
+                       }
+               }
+       }
+
+       /* combine area, position and sh */
+       for(b=0; b<TOTCHILD; b++) {
+               if(node->childflag & (1<<b)) {
+                       child= &tmpnode;
+                       occ_node_from_face(tree->face+node->child[b].face, &tmpnode);
+               }
+               else {
+                       child= node->child[b].node;
+               }
+
+               if(child) {
+                       node->area += child->area;
+                       sh_add(node->sh, node->sh, child->sh);
+                       VECADDFAC(node->co, node->co, child->co, child->area);
+               }
+       }
+
+       if(node->area != 0.0f)
+               VecMulf(node->co, 1.0f/node->area);
+
+       /* compute maximum distance from center */
+       node->dco= 0.0f;
+       occ_build_dco(tree, node, node->co, &node->dco);
+
+       return node;
+}
+
+static void occ_build_sh_normalize(OccNode *node)
+{
+       /* normalize spherical harmonics to not include area, so
+        * we can clamp the dot product and then mutliply by area */
+       int b;
+
+       if(node->area != 0.0f)
+               sh_mul(node->sh, 1.0f/node->area);
+
+       for(b=0; b<TOTCHILD; b++) {
+               if(node->childflag & (1<<b));
+               else if(node->child[b].node)
+                       occ_build_sh_normalize(node->child[b].node);
+       }
+}
+
+static OcclusionTree *occ_tree_build(Render *re)
+{
+       OcclusionTree *tree;
+       ObjectInstanceRen *obi;
+       ObjectRen *obr;
+       VlakRen *vlr= NULL;
+       int a, b, c, totface;
+
+       /* count */
+       totface= 0;
+       for(obi=re->instancetable.first; obi; obi=obi->next) {
+               if(obi->flag & R_TRANSFORMED)
+                       continue;
+
+               obr= obi->obr;
+               for(a=0; a<obr->totvlak; a++) {
+                       if((a & 255)==0) vlr= obr->vlaknodes[a>>8].vlak;
+                       else vlr++;
+
+                       if(vlr->mat->mode & MA_TRACEBLE)
+                               totface++;
+               }
+       }
+
+       if(totface == 0)
+               return NULL;
+       
+       tree= MEM_callocN(sizeof(OcclusionTree), "OcclusionTree");
+       tree->totface= totface;
+
+       /* parameters */
+       tree->error= re->wrld.ao_approx_error;
+       tree->distfac= (re->wrld.aomode & WO_AODIST)? re->wrld.aodistfac: 0.0f;
+
+       /* allocation */
+       tree->arena= BLI_memarena_new(0x8000 * sizeof(OccNode));
+       BLI_memarena_use_calloc(tree->arena);
+
+       if(re->wrld.aomode & WO_AOCACHE)
+               tree->cache= MEM_callocN(sizeof(OcclusionCache)*BLENDER_MAX_THREADS, "OcclusionCache");
+
+       tree->face= MEM_callocN(sizeof(OccFace)*totface, "OcclusionFace");
+       tree->co= MEM_callocN(sizeof(float)*3*totface, "OcclusionCo");
+       tree->occlusion= MEM_callocN(sizeof(float)*totface, "OcclusionOcclusion");
+
+       /* make array of face pointers */
+       for(b=0, c=0, obi=re->instancetable.first; obi; obi=obi->next, c++) {
+               if(obi->flag & R_TRANSFORMED)
+                       continue; /* temporary to avoid slow renders with loads of duplis */
+
+               obr= obi->obr;
+               for(a=0; a<obr->totvlak; a++) {
+                       if((a & 255)==0) vlr= obr->vlaknodes[a>>8].vlak;
+                       else vlr++;
+
+                       if(vlr->mat->mode & MA_TRACEBLE) {
+                               tree->face[b].obi= c;
+                               tree->face[b].facenr= a;
+                               tree->occlusion[b]= 1.0f;
+                               occ_face(&tree->face[b], tree->co[b], NULL, NULL); 
+                               b++;
+                       }
+               }
+       }
+
+       /* recurse */
+       tree->root= occ_build_recursive(tree, 0, totface, 1);
+
+#if 0
+       if(tree->doindirect) {
+               occ_build_shade(re, tree);
+               occ_sum_occlusion(tree, tree->root);
+       }
+#endif
+       
+       MEM_freeN(tree->co);
+       tree->co= NULL;
+
+       occ_build_sh_normalize(tree->root);
+
+       for(a=0; a<BLENDER_MAX_THREADS; a++)
+               tree->stack[a]= MEM_callocN(sizeof(OccNode)*TOTCHILD*(tree->maxdepth+1), "OccStack");
+
+       return tree;
+}
+
+static void occ_free_tree(OcclusionTree *tree)
+{
+       int a;
+
+       if(tree) {
+               if(tree->arena) BLI_memarena_free(tree->arena);
+               for(a=0; a<BLENDER_MAX_THREADS; a++)
+                       if(tree->stack[a])
+                               MEM_freeN(tree->stack[a]);
+               if(tree->occlusion) MEM_freeN(tree->occlusion);
+               if(tree->face) MEM_freeN(tree->face);
+               if(tree->cache) MEM_freeN(tree->cache);
+               MEM_freeN(tree);
+       }
+}
+
+/* ------------------------- Traversal --------------------------- */
+
+static float occ_solid_angle(OccNode *node, float *v, float d2, float *receivenormal)
+{
+       float dotreceive, dotemit, invd2 = 1.0f/sqrtf(d2);
+       float ev[3];
+
+       ev[0]= -v[0]*invd2;
+       ev[1]= -v[1]*invd2;
+       ev[2]= -v[2]*invd2;
+       dotemit= sh_eval(node->sh, ev);
+       dotreceive= INPR(receivenormal, v)*invd2;
+
+       CLAMP(dotemit, 0.0f, 1.0f);
+       CLAMP(dotreceive, 0.0f, 1.0f);
+       
+       return ((node->area*dotemit*dotreceive)/(d2 + node->area*INVPI))*INVPI;
+}
+
+static void VecAddDir(float *result, float *v1, float *v2, float fac)
+{
+       result[0]= v1[0] + fac*(v2[0] - v1[0]);
+       result[1]= v1[1] + fac*(v2[1] - v1[1]);
+       result[2]= v1[2] + fac*(v2[2] - v1[2]);
+}
+
+static int occ_visible_quad(float *p, float *n, float *v0, float *v1, float *v2, float *q0, float *q1, float *q2, float *q3)
+{
+       static const float epsilon = 1e-6f;
+       float c, sd[3];
+       
+       c= INPR(n, p);
+
+       /* signed distances from the vertices to the plane. */
+       sd[0]= INPR(n, v0) - c;
+       sd[1]= INPR(n, v1) - c;
+       sd[2]= INPR(n, v2) - c;
+
+       if(fabs(sd[0]) < epsilon) sd[0] = 0.0f;
+       if(fabs(sd[1]) < epsilon) sd[1] = 0.0f;
+       if(fabs(sd[2]) < epsilon) sd[2] = 0.0f;
+
+       if(sd[0] > 0) {
+               if(sd[1] > 0) {
+                       if(sd[2] > 0) {
+                               // +++
+                               VECCOPY(q0, v0);
+                               VECCOPY(q1, v1);
+                               VECCOPY(q2, v2);
+                               VECCOPY(q3, q2);
+                       }
+                       else if(sd[2] < 0) {
+                               // ++-
+                               VECCOPY(q0, v0);
+                               VECCOPY(q1, v1);
+                               VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
+                               VecAddDir(q3, v0, v2, (sd[0]/(sd[0]-sd[2])));
+                       }
+                       else {
+                               // ++0
+                               VECCOPY(q0, v0);
+                               VECCOPY(q1, v1);
+                               VECCOPY(q2, v2);
+                               VECCOPY(q3, q2);
+                       }
+               }
+               else if(sd[1] < 0) {
+                       if(sd[2] > 0) {
+                               // +-+
+                               VECCOPY(q0, v0);
+                               VecAddDir(q1, v0, v1, (sd[0]/(sd[0]-sd[1])));
+                               VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
+                               VECCOPY(q3, v2);
+                       }
+                       else if(sd[2] < 0) {
+                               // +--
+                               VECCOPY(q0, v0);
+                               VecAddDir(q1, v0, v1, (sd[0]/(sd[0]-sd[1])));
+                               VecAddDir(q2, v0, v2, (sd[0]/(sd[0]-sd[2])));
+                               VECCOPY(q3, q2);
+                       }
+                       else {
+                               // +-0
+                               VECCOPY(q0, v0);
+                               VecAddDir(q1, v0, v1, (sd[0]/(sd[0]-sd[1])));
+                               VECCOPY(q2, v2);
+                               VECCOPY(q3, q2);
+                       }
+               }
+               else {
+                       if(sd[2] > 0) {
+                               // +0+
+                               VECCOPY(q0, v0);
+                               VECCOPY(q1, v1);
+                               VECCOPY(q2, v2);
+                               VECCOPY(q3, q2);
+                       }
+                       else if(sd[2] < 0) {
+                               // +0-
+                               VECCOPY(q0, v0);
+                               VECCOPY(q1, v1);
+                               VecAddDir(q2, v0, v2, (sd[0]/(sd[0]-sd[2])));
+                               VECCOPY(q3, q2);
+                       }
+                       else {
+                               // +00
+                               VECCOPY(q0, v0);
+                               VECCOPY(q1, v1);
+                               VECCOPY(q2, v2);
+                               VECCOPY(q3, q2);
+                       }
+               }
+       }
+       else if(sd[0] < 0) {
+               if(sd[1] > 0) {
+                       if(sd[2] > 0) {
+                               // -++
+                               VecAddDir(q0, v0, v1, (sd[0]/(sd[0]-sd[1])));
+                               VECCOPY(q1, v1);
+                               VECCOPY(q2, v2);
+                               VecAddDir(q3, v0, v2, (sd[0]/(sd[0]-sd[2])));
+                       }
+                       else if(sd[2] < 0) {
+                               // -+-
+                               VecAddDir(q0, v0, v1, (sd[0]/(sd[0]-sd[1])));
+                               VECCOPY(q1, v1);
+                               VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
+                               VECCOPY(q3, q2);
+                       }
+                       else {
+                               // -+0
+                               VecAddDir(q0, v0, v1, (sd[0]/(sd[0]-sd[1])));
+                               VECCOPY(q1, v1);
+                               VECCOPY(q2, v2);
+                               VECCOPY(q3, q2);
+                       }
+               }
+               else if(sd[1] < 0) {
+                       if(sd[2] > 0) {
+                               // --+
+                               VecAddDir(q0, v0, v2, (sd[0]/(sd[0]-sd[2])));
+                               VecAddDir(q1, v1, v2, (sd[1]/(sd[1]-sd[2])));
+                               VECCOPY(q2, v2);
+                               VECCOPY(q3, q2);
+                       }
+                       else if(sd[2] < 0) {
+                               // ---
+                               return 0;
+                       }
+                       else {
+                               // --0
+                               return 0;
+                       }
+               }
+               else {
+                       if(sd[2] > 0) {
+                               // -0+
+                               VecAddDir(q0, v0, v2, (sd[0]/(sd[0]-sd[2])));
+                               VECCOPY(q1, v1);
+                               VECCOPY(q2, v2);
+                               VECCOPY(q3, q2);
+                       }
+                       else if(sd[2] < 0) {
+                               // -0-
+                               return 0;
+                       }
+                       else {
+                               // -00
+                               return 0;
+                       }
+               }
+       }
+       else {
+               if(sd[1] > 0) {
+                       if(sd[2] > 0) {
+                               // 0++
+                               VECCOPY(q0, v0);
+                               VECCOPY(q1, v1);
+                               VECCOPY(q2, v2);
+                               VECCOPY(q3, q2);
+                       }
+                       else if(sd[2] < 0) {
+                               // 0+-
+                               VECCOPY(q0, v0);
+                               VECCOPY(q1, v1);
+                               VecAddDir(q2, v1, v2, (sd[1]/(sd[1]-sd[2])));
+                               VECCOPY(q3, q2);
+                       }
+                       else {
+                               // 0+0
+                               VECCOPY(q0, v0);
+                               VECCOPY(q1, v1);
+                               VECCOPY(q2, v2);
+                               VECCOPY(q3, q2);
+                       }
+               }
+               else if(sd[1] < 0) {
+                       if(sd[2] > 0) {
+                               // 0-+
+                               VECCOPY(q0, v0);
+                               VecAddDir(q1, v1, v2, (sd[1]/(sd[1]-sd[2])));
+                               VECCOPY(q2, v2);
+                               VECCOPY(q3, q2);
+                       }
+                       else if(sd[2] < 0) {
+                               // 0--
+                               return 0;
+                       }
+                       else {
+                               // 0-0
+                               return 0;
+                       }
+               }
+               else {
+                       if(sd[2] > 0) {
+                               // 00+
+                               VECCOPY(q0, v0);
+                               VECCOPY(q1, v1);
+                               VECCOPY(q2, v2);
+                               VECCOPY(q3, q2);
+                       }
+                       else if(sd[2] < 0) {
+                               // 00-
+                               return 0;
+                       }
+                       else {
+                               // 000
+                               return 0;
+                       }
+               }
+       }
+
+       return 1;
+}
+
+/* altivec optimization, this works, but is unused */
+
+#if 0
+#include <Accelerate/Accelerate.h>
+
+typedef union {
+       vFloat v;
+       float f[4];
+} vFloatResult;
+
+static vFloat vec_splat_float(float val)
+{
+       return (vFloat){val, val, val, val};
+}
+
+static float occ_quad_form_factor(float *p, float *n, float *q0, float *q1, float *q2, float *q3)
+{
+       vFloat vcos, rlen, vrx, vry, vrz, vsrx, vsry, vsrz, gx, gy, gz, vangle;
+       vUInt8 rotate = (vUInt8){4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3};
+       vFloatResult vresult;
+       float result;
+
+       /* compute r* */
+       vrx = (vFloat){q0[0], q1[0], q2[0], q3[0]} - vec_splat_float(p[0]);
+       vry = (vFloat){q0[1], q1[1], q2[1], q3[1]} - vec_splat_float(p[1]);
+       vrz = (vFloat){q0[2], q1[2], q2[2], q3[2]} - vec_splat_float(p[2]);
+
+       /* normalize r* */
+       rlen = vec_rsqrte(vrx*vrx + vry*vry + vrz*vrz + vec_splat_float(1e-16f));
+       vrx = vrx*rlen;
+       vry = vry*rlen;
+       vrz = vrz*rlen;
+
+       /* rotate r* for cross and dot */
+       vsrx= vec_perm(vrx, vrx, rotate);
+       vsry= vec_perm(vry, vry, rotate);
+       vsrz= vec_perm(vrz, vrz, rotate);
+
+       /* cross product */
+       gx = vsry*vrz - vsrz*vry;
+       gy = vsrz*vrx - vsrx*vrz;
+       gz = vsrx*vry - vsry*vrx;
+
+       /* normalize */
+       rlen = vec_rsqrte(gx*gx + gy*gy + gz*gz + vec_splat_float(1e-16f));
+       gx = gx*rlen;
+       gy = gy*rlen;
+       gz = gz*rlen;
+
+       /* angle */
+       vcos = vrx*vsrx + vry*vsry + vrz*vsrz;
+       vcos= vec_max(vec_min(vcos, vec_splat_float(1.0f)), vec_splat_float(-1.0f));
+       vangle= vacosf(vcos);
+
+       /* dot */
+       vresult.v = (vec_splat_float(n[0])*gx +
+                    vec_splat_float(n[1])*gy +
+                    vec_splat_float(n[2])*gz)*vangle;
+
+       result= (vresult.f[0] + vresult.f[1] + vresult.f[2] + vresult.f[3])*(0.5f/(float)M_PI);
+       result= MAX2(result, 0.0f);
+
+       return result;
+}
+
+#endif
+
+/* SSE optimization, acos code doesn't work */
+
+#if 0
+
+#include <xmmintrin.h>
+
+static __m128 sse_approx_acos(__m128 x)
+{
+       /* needs a better approximation than taylor expansion of acos, since that
+        * gives big erros for near 1.0 values, sqrt(2*x)*acos(1-x) should work
+        * better, see http://www.tom.womack.net/projects/sse-fast-arctrig.html */
+
+       return _mm_set_ps1(1.0f);
+}
+
+static float occ_quad_form_factor(float *p, float *n, float *q0, float *q1, float *q2, float *q3)
+{
+       float r0[3], r1[3], r2[3], r3[3], g0[3], g1[3], g2[3], g3[3];
+       float a1, a2, a3, a4, dot1, dot2, dot3, dot4, result;
+       float fresult[4] __attribute__((aligned(16)));
+       __m128 qx, qy, qz, rx, ry, rz, rlen, srx, sry, srz, gx, gy, gz, glen, rcos, angle, aresult;
+
+       /* compute r */
+       qx = _mm_set_ps(q3[0], q2[0], q1[0], q0[0]);
+       qy = _mm_set_ps(q3[1], q2[1], q1[1], q0[1]);
+       qz = _mm_set_ps(q3[2], q2[2], q1[2], q0[2]);
+
+       rx = qx - _mm_set_ps1(p[0]);
+       ry = qy - _mm_set_ps1(p[1]);
+       rz = qz - _mm_set_ps1(p[2]);
+
+       /* normalize r */
+       rlen = _mm_rsqrt_ps(rx*rx + ry*ry + rz*rz + _mm_set_ps1(1e-16f));
+       rx = rx*rlen;
+       ry = ry*rlen;
+       rz = rz*rlen;
+
+       /* cross product */
+       srx = _mm_shuffle_ps(rx, rx, _MM_SHUFFLE(0,3,2,1));
+       sry = _mm_shuffle_ps(ry, ry, _MM_SHUFFLE(0,3,2,1));
+       srz = _mm_shuffle_ps(rz, rz, _MM_SHUFFLE(0,3,2,1));
+
+       gx = sry*rz - srz*ry;
+       gy = srz*rx - srx*rz;
+       gz = srx*ry - sry*rx;
+
+       /* normalize g */
+       glen = _mm_rsqrt_ps(gx*gx + gy*gy + gz*gz + _mm_set_ps1(1e-16f));
+       gx = gx*glen;
+       gy = gy*glen;
+       gz = gz*glen;
+
+       /* compute angle */
+       rcos = rx*srx + ry*sry + rz*srz;
+       rcos= _mm_max_ps(_mm_min_ps(rcos, _mm_set_ps1(1.0f)), _mm_set_ps1(-1.0f));
+
+       angle = sse_approx_cos(rcos);
+       aresult = (_mm_set_ps1(n[0])*gx + _mm_set_ps1(n[1])*gy + _mm_set_ps1(n[2])*gz)*angle;
+
+       /* sum together */
+    result= (fresult[0] + fresult[1] + fresult[2] + fresult[3])*(0.5f/(float)M_PI);
+       result= MAX2(result, 0.0f);
+
+       return result;
+}
+
+#endif
+
+static float saacosf(float fac)
+{
+       if(fac<= -1.0f) return (float)M_PI;
+       else if(fac>=1.0f) return 0.0f;
+       else return acos(fac); /* acosf(fac) */
+}
+
+static void normalizef(float *n)
+{
+       float d;
+       
+       d= INPR(n, n);
+
+       if(d > 1.0e-35F) {
+               d= 1.0f/sqrt(d); /* sqrtf(d) */
+
+               n[0] *= d; 
+               n[1] *= d; 
+               n[2] *= d;
+       } 
+}
+
+static float occ_quad_form_factor(float *p, float *n, float *q0, float *q1, float *q2, float *q3)
+{
+       float r0[3], r1[3], r2[3], r3[3], g0[3], g1[3], g2[3], g3[3];
+       float a1, a2, a3, a4, dot1, dot2, dot3, dot4, result;
+
+       VECSUB(r0, q0, p);
+       VECSUB(r1, q1, p);
+       VECSUB(r2, q2, p);
+       VECSUB(r3, q3, p);
+
+       normalizef(r0);
+       normalizef(r1);
+       normalizef(r2);
+       normalizef(r3);
+
+       Crossf(g0, r1, r0); normalizef(g0);
+       Crossf(g1, r2, r1); normalizef(g1);
+       Crossf(g2, r3, r2); normalizef(g2);
+       Crossf(g3, r0, r3); normalizef(g3);
+
+       a1= saacosf(INPR(r0, r1));
+       a2= saacosf(INPR(r1, r2));
+       a3= saacosf(INPR(r2, r3));
+       a4= saacosf(INPR(r3, r0));
+
+       dot1= INPR(n, g0);
+       dot2= INPR(n, g1);
+       dot3= INPR(n, g2);
+       dot4= INPR(n, g3);
+
+       result= (a1*dot1 + a2*dot2 + a3*dot3 + a4*dot4)*0.5f/(float)M_PI;
+       result= MAX2(result, 0.0f);
+
+       return result;
+}
+
+float occ_form_factor(OccFace *face, float *p, float *n)
+{
+       ObjectInstanceRen *obi;
+       VlakRen *vlr;
+       float v1[3], v2[3], v3[3], v4[3], q0[3], q1[3], q2[3], q3[3], contrib= 0.0f;
+
+       obi= &R.objectinstance[face->obi];
+       vlr= RE_findOrAddVlak(obi->obr, face->facenr);
+
+       VECCOPY(v1, vlr->v1->co);
+       VECCOPY(v2, vlr->v2->co);
+       VECCOPY(v3, vlr->v3->co);
+
+       if(obi->flag & R_TRANSFORMED) {
+               Mat4MulVecfl(obi->mat, v1);
+               Mat4MulVecfl(obi->mat, v2);
+               Mat4MulVecfl(obi->mat, v3);
+       }
+
+       if(occ_visible_quad(p, n, v1, v2, v3, q0, q1, q2, q3))
+               contrib += occ_quad_form_factor(p, n, q0, q1, q2, q3);
+
+       if(vlr->v4) {
+               VECCOPY(v4, vlr->v4->co);
+               if(obi->flag & R_TRANSFORMED)
+                       Mat4MulVecfl(obi->mat, v4);
+
+               if(occ_visible_quad(p, n, v1, v3, v4, q0, q1, q2, q3))
+                       contrib += occ_quad_form_factor(p, n, q0, q1, q2, q3);
+       }
+
+       return contrib;
+}
+
+static void occ_lookup(OcclusionTree *tree, int thread, OccFace *exclude, float *pp, float *pn, float *occ, float *bentn)
+{
+       OccNode *node, **stack;
+       OccFace *face;
+       float resultocc, v[3], p[3], n[3], co[3];
+       float distfac, error, d2, weight, emitarea;
+       int b, totstack;
+
+       /* init variables */
+       VECCOPY(p, pp);
+       VECCOPY(n, pn);
+       VECADDFAC(p, p, n, 1e-4f);
+
+       if(bentn)
+               VECCOPY(bentn, n);
+       
+       error= tree->error;
+       distfac= tree->distfac;
+
+       resultocc= 0.0f;
+
+       /* init stack */
+       stack= tree->stack[thread];
+       stack[0]= tree->root;
+       totstack= 1;
+
+       while(totstack) {
+               /* pop point off the stack */
+               node= stack[--totstack];
+
+               VECSUB(v, node->co, p);
+               d2= INPR(v, v) + 1e-16f;
+               emitarea= MAX2(node->area, node->dco);
+
+               if(d2*error > emitarea) {
+                       /* accumulate occlusion from spherical harmonics */
+                       weight= occ_solid_angle(node, v, d2, n);
+                       weight *= node->occlusion;
+
+                       if(bentn) {
+                               Normalize(v);
+                               bentn[0] -= weight*v[0];
+                               bentn[1] -= weight*v[1];
+                               bentn[2] -= weight*v[2];
+                       }
+
+                       if(distfac != 0.0f)
+                               weight /= (1.0 + distfac*d2);
+
+                       resultocc += weight;
+               }
+               else {
+                       /* traverse into children */
+                       for(b=0; b<TOTCHILD; b++) {
+                               if(node->childflag & (1<<b)) {
+                                       face= tree->face+node->child[b].face;
+
+                                       /* accumulate occlusion with face form factor */
+                                       if(!exclude || !(face->obi == exclude->obi && face->facenr == exclude->facenr)) {
+                                               weight= occ_form_factor(face, p, n);
+                                               weight *= tree->occlusion[node->child[b].face];
+
+                                               if(bentn || distfac != 0.0f) {
+                                                       occ_face(face, co, NULL, NULL); 
+                                                       VECSUB(v, co, p);
+
+                                                       if(bentn) {
+                                                               Normalize(v);
+                                                               bentn[0] -= weight*v[0];
+                                                               bentn[1] -= weight*v[1];
+                                                               bentn[2] -= weight*v[2];
+                                                       }
+
+                                                       if(distfac != 0.0f) {
+                                                               d2= INPR(v, v) + 1e-16f;
+                                                               weight /= (1.0 + distfac*d2);
+                                                       }
+                                               }
+
+                                               resultocc += weight;
+
+                                       }
+                               }
+                               else if(node->child[b].node) {
+                                       /* push child on the stack */
+                                       stack[totstack++]= node->child[b].node;
+                               }
+                       }
+               }
+       }
+
+       if(occ) *occ= resultocc;
+}
+
+static void occ_compute_passes(Render *re, OcclusionTree *tree, int totpass)
+{
+       float *occ, co[3], n[3];
+       int pass, i;
+       
+       occ= MEM_callocN(sizeof(float)*tree->totface, "OcclusionPassOcc");
+
+       for(pass=0; pass<totpass; pass++) {
+               for(i=0; i<tree->totface; i++) {
+                       occ_face(&tree->face[i], co, n, NULL);
+                       VecMulf(n, -1.0f);
+                       VECADDFAC(co, co, n, 1e-8f);
+
+                       occ_lookup(tree, 0, &tree->face[i], co, n, &occ[i], NULL);
+                       if(re->test_break())
+                               break;
+               }
+
+               if(re->test_break())
+                       break;
+
+               for(i=0; i<tree->totface; i++) {
+                       tree->occlusion[i] -= occ[i]; //MAX2(1.0f-occ[i], 0.0f);
+                       if(tree->occlusion[i] < 0.0f)
+                               tree->occlusion[i]= 0.0f;
+               }
+
+               occ_sum_occlusion(tree, tree->root);
+       }
+
+       MEM_freeN(occ);
+}
+
+static void sample_occ_tree(Render *re, OcclusionTree *tree, OccFace *exclude, float *co, float *n, int thread, int onlyshadow, float *skycol)
+{
+       float nn[3], bn[3], dxyview[3], fac, occ, occlusion, correction;
+       int aocolor;
+
+       aocolor= re->wrld.aocolor;
+       if(onlyshadow)
+               aocolor= WO_AOPLAIN;
+
+       VECCOPY(nn, n);
+       VecMulf(nn, -1.0f);
+
+       occ_lookup(tree, thread, exclude, co, nn, &occ, (aocolor)? bn: NULL);
+
+       correction= re->wrld.ao_approx_correction;
+
+       occlusion= (1.0f-correction)*(1.0f-occ);
+       CLAMP(occlusion, 0.0f, 1.0f);
+       if(correction != 0.0f)
+               occlusion += correction*exp(-occ);
+
+       if(aocolor) {
+               /* sky shading using bent normal */
+               if(aocolor==WO_AOSKYCOL) {
+                       fac= 0.5*(1.0f+bn[0]*re->grvec[0]+ bn[1]*re->grvec[1]+ bn[2]*re->grvec[2]);
+                       skycol[0]= (1.0f-fac)*re->wrld.horr + fac*re->wrld.zenr;
+                       skycol[1]= (1.0f-fac)*re->wrld.horg + fac*re->wrld.zeng;
+                       skycol[2]= (1.0f-fac)*re->wrld.horb + fac*re->wrld.zenb;
+               }
+               else {  /* WO_AOSKYTEX */
+                       dxyview[0]= 1.0f;
+                       dxyview[1]= 1.0f;
+                       dxyview[2]= 0.0f;
+                       shadeSkyView(skycol, co, bn, dxyview);
+               }
+
+               VecMulf(skycol, occlusion);
+       }
+       else {
+               skycol[0]= occlusion;
+               skycol[1]= occlusion;
+               skycol[2]= occlusion;
+       }
+}
+
+/* ---------------------------- Caching ------------------------------- */
+
+static OcclusionCacheSample *find_occ_sample(OcclusionCache *cache, int x, int y)
+{
+       x -= cache->x;
+       y -= cache->y;
+
+       x /= cache->step;
+       y /= cache->step;
+       x *= cache->step;
+       y *= cache->step;
+
+       if(x < 0 || x >= cache->w || y < 0 || y >= cache->h)
+               return NULL;
+       else
+               return &cache->sample[y*cache->w + x];
+}
+
+static int sample_occ_cache(OcclusionTree *tree, float *co, float *n, int x, int y, int thread, float *col)
+{
+       OcclusionCache *cache;
+       OcclusionCacheSample *samples[4], *sample;
+       float wn[4], wz[4], wb[4], tx, ty, w, totw, mino, maxo;
+       float d[3], dist2;
+       int i, x1, y1, x2, y2;
+
+       if(!tree->cache)
+               return 0;
+       
+       /* first try to find a sample in the same pixel */
+       cache= &tree->cache[thread];
+
+       if(cache->sample && cache->step) {
+               sample= &cache->sample[(y-cache->y)*cache->w + (x-cache->x)];
+               if(sample->filled) {
+                       VECSUB(d, sample->co, co);
+                       dist2= INPR(d, d);
+                       if(dist2 < 0.5f*sample->dist2 && INPR(sample->n, n) > 0.98f) {
+                               VECCOPY(col, sample->col);
+                               return 1;
+                       }
+               }
+       }
+       else
+               return 0;
+
+       /* try to interpolate between 4 neighbouring pixels */
+       samples[0]= find_occ_sample(cache, x, y);
+       samples[1]= find_occ_sample(cache, x+cache->step, y);
+       samples[2]= find_occ_sample(cache, x, y+cache->step);
+       samples[3]= find_occ_sample(cache, x+cache->step, y+cache->step);
+
+       for(i=0; i<4; i++)
+               if(!samples[i] || !samples[i]->filled)
+                       return 0;
+
+       /* require intensities not being too different */
+       mino= MIN4(samples[0]->intensity, samples[1]->intensity, samples[2]->intensity, samples[3]->intensity);
+       maxo= MAX4(samples[0]->intensity, samples[1]->intensity, samples[2]->intensity, samples[3]->intensity);
+
+       if(maxo - mino > 0.05f)
+               return 0;
+
+       /* compute weighted interpolation between samples */
+       col[0]= col[1]= col[2]= 0.0f;
+       totw= 0.0f;
+
+       x1= samples[0]->x;
+       y1= samples[0]->y;
+       x2= samples[3]->x;
+       y2= samples[3]->y;
+
+       tx= (float)(x2 - x)/(float)(x2 - x1);
+       ty= (float)(y2 - y)/(float)(y2 - y1);
+
+       wb[3]= (1.0f-tx)*(1.0f-ty);
+       wb[2]= (tx)*(1.0f-ty);
+       wb[1]= (1.0f-tx)*(ty);
+       wb[0]= tx*ty;
+
+       for(i=0; i<4; i++) {
+               VECSUB(d, samples[i]->co, co);
+               dist2= INPR(d, d);
+
+               wz[i]= 1.0f; //(samples[i]->dist2/(1e-4f + dist2));
+               wn[i]= pow(INPR(samples[i]->n, n), 32.0f);
+
+               w= wb[i]*wn[i]*wz[i];
+
+               totw += w;
+               col[0] += w*samples[i]->col[0];
+               col[1] += w*samples[i]->col[1];
+               col[2] += w*samples[i]->col[2];
+       }
+
+       if(totw >= 0.9f) {
+               totw= 1.0f/totw;
+               col[0] *= totw;
+               col[1] *= totw;
+               col[2] *= totw;
+               return 1;
+       }
+
+       return 0;
+}
+
+static void sample_occ_cache_mesh(ShadeInput *shi)
+{
+       StrandRen *strand= shi->strand;
+       OcclusionCacheMesh *mesh= strand->buffer->occlusionmesh;
+       int *face, *index = RE_strandren_get_face(shi->obr, strand, 0);
+       float w[4], *co1, *co2, *co3, *co4;
+
+       if(mesh && index) {
+               face= mesh->face[*index];
+
+               co1= mesh->co[face[0]];
+               co2= mesh->co[face[1]];
+               co3= mesh->co[face[2]];
+               co4= (face[3])? mesh->co[face[3]]: NULL;
+
+               InterpWeightsQ3Dfl(co1, co2, co3, co4, strand->vert->co, w);
+
+               shi->ao[0]= shi->ao[1]= shi->ao[2]= 0.0f;
+               VECADDFAC(shi->ao, shi->ao, mesh->col[face[0]], w[0]);
+               VECADDFAC(shi->ao, shi->ao, mesh->col[face[1]], w[1]);
+               VECADDFAC(shi->ao, shi->ao, mesh->col[face[2]], w[2]);
+               if(face[3])
+                       VECADDFAC(shi->ao, shi->ao, mesh->col[face[3]], w[3]);
+       }
+       else {
+               shi->ao[0]= 1.0f;
+               shi->ao[1]= 1.0f;
+               shi->ao[2]= 1.0f;
+       }
+}
+
+void *cache_occ_mesh(Render *re, ObjectRen *obr, DerivedMesh *dm, float mat[][4])
+{
+       OcclusionCacheMesh *mesh;
+       MFace *mface;
+       MVert *mvert;
+       int a, totvert, totface;
+
+       totvert= dm->getNumVerts(dm);
+       totface= dm->getNumFaces(dm);
+
+       mesh= re->occlusionmesh.last;
+       if(mesh && mesh->obr->ob == obr->ob && mesh->obr->par == obr->par
+               && mesh->totvert==totvert && mesh->totface==totface)
+               return mesh;
+
+       mesh= MEM_callocN(sizeof(OcclusionCacheMesh), "OcclusionCacheMesh");
+       mesh->obr= obr;
+       mesh->totvert= totvert;
+       mesh->totface= totface;
+       mesh->co= MEM_callocN(sizeof(float)*3*mesh->totvert, "OcclusionMeshCo");
+       mesh->face= MEM_callocN(sizeof(int)*4*mesh->totface, "OcclusionMeshFaces");
+       mesh->col= MEM_callocN(sizeof(float)*3*mesh->totvert, "OcclusionMeshCol");
+
+       mvert= dm->getVertArray(dm);
+       for(a=0; a<mesh->totvert; a++, mvert++) {
+               VECCOPY(mesh->co[a], mvert->co);
+               Mat4MulVecfl(mat, mesh->co[a]);
+       }
+
+       mface= dm->getFaceArray(dm);
+       for(a=0; a<mesh->totface; a++, mface++) {
+               mesh->face[a][0]= mface->v1;
+               mesh->face[a][1]= mface->v2;
+               mesh->face[a][2]= mface->v3;
+               mesh->face[a][3]= mface->v4;
+       }
+
+       BLI_addtail(&re->occlusionmesh, mesh);
+
+       return mesh;
+}
+
+/* ------------------------- External Functions --------------------------- */
+
+void make_occ_tree(Render *re)
+{
+       OcclusionCacheMesh *mesh;
+       float col[3], co[3], n[3], *co1, *co2, *co3, *co4;
+       int a, *face, *count;
+
+       /* ugly, needed for occ_face */
+       R= *re;
+
+       re->i.infostr= "Occlusion preprocessing";
+       re->stats_draw(&re->i);
+       
+       re->occlusiontree= occ_tree_build(re);
+       
+       if(re->occlusiontree) {
+               if(re->wrld.ao_approx_passes)
+                       occ_compute_passes(re, re->occlusiontree, re->wrld.ao_approx_passes);
+
+               for(mesh=re->occlusionmesh.first; mesh; mesh=mesh->next) {
+                       count= MEM_callocN(sizeof(int)*mesh->totvert, "OcclusionCount");
+
+                       for(a=0; a<mesh->totface; a++) {
+                               face= mesh->face[a];
+                               co1= mesh->co[face[0]];
+                               co2= mesh->co[face[1]];
+                               co3= mesh->co[face[2]];
+
+                               if(face[3]) {
+                                       co4= mesh->co[face[3]];
+
+                                       VecLerpf(co, co1, co3, 0.5f);
+                                       CalcNormFloat4(co1, co2, co3, co4, n);
+                               }
+                               else {
+                                       CalcCent3f(co, co1, co2, co3);
+                                       CalcNormFloat(co1, co2, co3, n);
+                               }
+                               VecMulf(n, -1.0f);
+
+                               sample_occ_tree(re, re->occlusiontree, NULL, co, n, 0, 0, col);
+
+                               VECADD(mesh->col[face[0]], mesh->col[face[0]], col);
+                               count[face[0]]++;
+                               VECADD(mesh->col[face[1]], mesh->col[face[1]], col);
+                               count[face[1]]++;
+                               VECADD(mesh->col[face[2]], mesh->col[face[2]], col);
+                               count[face[2]]++;
+
+                               if(face[3]) {
+                                       VECADD(mesh->col[face[3]], mesh->col[face[3]], col);
+                                       count[face[3]]++;
+                               }
+                       }
+
+                       for(a=0; a<mesh->totvert; a++)
+                               if(count[a])
+                                       VecMulf(mesh->col[a], 1.0f/count[a]);
+
+                       MEM_freeN(count);
+               }
+       }
+}
+
+void free_occ(Render *re)
+{
+       OcclusionCacheMesh *mesh;
+
+       if(re->occlusiontree) {
+               occ_free_tree(re->occlusiontree);
+               re->occlusiontree = NULL;
+       }
+
+       for(mesh=re->occlusionmesh.first; mesh; mesh=mesh->next) {
+               if(mesh->co) MEM_freeN(mesh->co);
+               if(mesh->col) MEM_freeN(mesh->col);
+               if(mesh->face) MEM_freeN(mesh->face);
+       }
+
+       BLI_freelistN(&re->occlusionmesh);
+}
+
+void sample_occ(Render *re, ShadeInput *shi)
+{
+       OcclusionTree *tree= re->occlusiontree;
+       OcclusionCache *cache;
+       OcclusionCacheSample *sample;
+       OccFace exclude;
+       int onlyshadow;
+
+       if(tree) {
+               if(shi->strand) {
+                       sample_occ_cache_mesh(shi);
+               }
+               /* try to get result from the cache if possible */
+               else if(!sample_occ_cache(tree, shi->co, shi->vno, shi->xs, shi->ys, shi->thread, shi->ao)) {
+                       /* no luck, let's sample the occlusion */
+                       exclude.obi= shi->obi - re->objectinstance;
+                       exclude.facenr= shi->vlr->index;
+                       onlyshadow= (shi->mat->mode & MA_ONLYSHADOW);
+                       sample_occ_tree(re, tree, &exclude, shi->co, shi->vno, shi->thread, onlyshadow, shi->ao);
+
+                       if(G.rt & 32)
+                               shi->ao[2] *= 2.0f;
+
+                       /* fill result into sample, each time */
+                       if(tree->cache) {
+                               cache= &tree->cache[shi->thread];
+
+                               if(cache->sample && cache->step) {
+                                       sample= &cache->sample[(shi->ys-cache->y)*cache->w + (shi->xs-cache->x)];
+                                       VECCOPY(sample->co, shi->co);
+                                       VECCOPY(sample->n, shi->vno);
+                                       VECCOPY(sample->col, shi->ao);
+                                       sample->intensity= MAX3(sample->col[0], sample->col[1], sample->col[2]);
+                                       sample->dist2= INPR(shi->dxco, shi->dxco) + INPR(shi->dyco, shi->dyco);
+                                       sample->filled= 1;
+                               }
+                       }
+               }
+       }
+       else {
+               shi->ao[0]= 1.0f;
+               shi->ao[1]= 1.0f;
+               shi->ao[2]= 1.0f;
+       }
+}
+
+void cache_occ_samples(Render *re, RenderPart *pa, ShadeSample *ssamp)
+{
+       OcclusionTree *tree= re->occlusiontree;
+       PixStr ps;
+       OcclusionCache *cache;
+       OcclusionCacheSample *sample;
+       OccFace exclude;
+       ShadeInput *shi;
+       int *ro, *rp, *rz, onlyshadow;
+       int x, y, step = CACHE_STEP;
+
+       if(!tree->cache)
+               return;
+
+       cache= &tree->cache[pa->thread];
+       cache->w= pa->rectx;
+       cache->h= pa->recty;
+       cache->x= pa->disprect.xmin;
+       cache->y= pa->disprect.ymin;
+       cache->step= step;
+       cache->sample= MEM_callocN(sizeof(OcclusionCacheSample)*cache->w*cache->h, "OcclusionCacheSample");
+       sample= cache->sample;
+
+       ps.next= NULL;
+       ps.mask= 0xFFFF;
+
+       ro= pa->recto;
+       rp= pa->rectp;
+       rz= pa->rectz;
+
+       /* compute a sample at every step pixels */
+       for(y=pa->disprect.ymin; y<pa->disprect.ymax; y++) {
+               for(x=pa->disprect.xmin; x<pa->disprect.xmax; x++, ro++, rp++, rz++) {
+                       if((((x - pa->disprect.xmin + step) % step) == 0 || x == pa->disprect.xmax-1) && (((y - pa->disprect.ymin + step) % step) == 0 || y == pa->disprect.ymax-1)) {
+                               if(*rp) {
+                                       ps.obi= *ro;
+                                       ps.facenr= *rp;
+                                       ps.z= *rz;
+
+                                       shade_samples_fill_with_ps(ssamp, &ps, x, y);
+                                       shi= ssamp->shi;
+
+                                       onlyshadow= (shi->mat->mode & MA_ONLYSHADOW);
+                                       exclude.obi= shi->obi - re->objectinstance;
+                                       exclude.facenr= shi->vlr->index;
+                                       sample_occ_tree(re, tree, &exclude, shi->co, shi->vno, shi->thread, onlyshadow, shi->ao);
+
+                                       VECCOPY(sample->co, shi->co);
+                                       VECCOPY(sample->n, shi->vno);
+                                       VECCOPY(sample->col, shi->ao);
+                                       sample->intensity= MAX3(sample->col[0], sample->col[1], sample->col[2]);
+                                       sample->dist2= INPR(shi->dxco, shi->dxco) + INPR(shi->dyco, shi->dyco);
+                                       sample->x= shi->xs;
+                                       sample->y= shi->ys;
+                                       sample->filled= 1;
+                               }
+
+                               if(re->test_break())
+                                       break;
+                       }
+
+                       sample++;
+               }
+       }
+}
+
+void free_occ_samples(Render *re, RenderPart *pa)
+{
+       OcclusionTree *tree= re->occlusiontree;
+       OcclusionCache *cache;
+
+       if(tree->cache) {
+               cache= &tree->cache[pa->thread];
+
+               if(cache->sample)
+                       MEM_freeN(cache->sample);
+
+               cache->w= 0;
+               cache->h= 0;
+               cache->step= 0;
+       }
+}
+
index 0f9d44701641bd0ec2a8667798b23783af07b25c..2c03b0ae6c7bcf627e17d7b897a98c897bbc1f26 100644 (file)
@@ -538,6 +538,9 @@ static RenderResult *new_render_result(Render *re, rcti *partrct, int crop, int
                        if(srl->passflag  & SCE_PASS_REFRACT)
                                render_layer_add_pass(rr, rl, 3, SCE_PASS_REFRACT);
                }
+               else if(re->wrld.mode & WO_AMB_OCC)
+                       if(re->wrld.ao_gather_method == WO_AOGATHER_APPROX)
+                               render_layer_add_pass(rr, rl, 3, SCE_PASS_AO);
                if(re->r.mode & R_RADIO)
                        if(srl->passflag  & SCE_PASS_RADIO)
                                render_layer_add_pass(rr, rl, 3, SCE_PASS_RADIO);
index e2f331afafdb8589fff6822afc1b4383745f0be2..d0ea69dc833ec1182d895e78029c786210384050 100644 (file)
@@ -60,6 +60,7 @@
 #include "renderpipeline.h"
 #include "render_types.h"
 #include "renderdatabase.h"
+#include "occlusion.h"
 #include "pixelblending.h"
 #include "pixelshading.h"
 #include "shadbuf.h"
@@ -534,7 +535,11 @@ static void shadeDA_tile(RenderPart *pa, RenderLayer *rl)
        /* general shader info, passes */
        shade_sample_initialize(&ssamp, pa, rl);
        addpassflag= rl->passflag & ~(SCE_PASS_Z|SCE_PASS_COMBINED);
-                               
+
+       /* occlusion caching */
+       if(R.occlusiontree)
+               cache_occ_samples(&R, pa, &ssamp);
+               
        /* filtered render, for now we assume only 1 filter size */
        if(pa->crop) {
                crop= 1;
@@ -582,6 +587,9 @@ static void shadeDA_tile(RenderPart *pa, RenderLayer *rl)
        
        if(R.r.mode & R_SHADOW)
                ISB_free(pa);
+
+       if(R.occlusiontree)
+               free_occ_samples(&R, pa);
 }
 
 /* ************* pixel struct ******** */
@@ -1072,6 +1080,9 @@ void zbufshade_tile(RenderPart *pa)
                                /* irregular shadowb buffer creation */
                                if(R.r.mode & R_SHADOW)
                                        ISB_create(pa, NULL);
+
+                               if(R.occlusiontree)
+                                       cache_occ_samples(&R, pa, &ssamp);
                                
                                for(y=pa->disprect.ymin; y<pa->disprect.ymax; y++, rr->renrect.ymax++) {
                                        for(x=pa->disprect.xmin; x<pa->disprect.xmax; x++, ro++, rz++, rp++, fcol+=4, offs++) {
@@ -1085,9 +1096,6 @@ void zbufshade_tile(RenderPart *pa)
                                                        if(shade_samples(&ssamp, &ps, x, y)) {
                                                                QUATCOPY(fcol, ssamp.shr[0].combined);
 
-                                                               if(!(fcol[0] == fcol[0]))
-                                                                       printvecf("fudgecol", fcol);
-                                                               
                                                                /* passes */
                                                                if(addpassflag)
                                                                        add_passes(rl, offs, ssamp.shi, ssamp.shr);
@@ -1098,6 +1106,9 @@ void zbufshade_tile(RenderPart *pa)
                                                if(R.test_break()) break; 
                                }
                                
+                               if(R.occlusiontree)
+                                       free_occ_samples(&R, pa);
+                               
                                if(R.r.mode & R_SHADOW)
                                        ISB_free(pa);
                        }
index 58a89809b83ab2e333677c8526a2f569f9dc3af9..8aaf29b65c4f9a135907187abf1b52de33cfcedb 100644 (file)
 #define RE_UV_ELEMS                    2
 #define RE_SURFNOR_ELEMS       3
 #define RE_SIMPLIFY_ELEMS      2
+#define RE_FACE_ELEMS  1
 
 float *RE_vertren_get_sticky(ObjectRen *obr, VertRen *ver, int verify)
 {
@@ -607,6 +608,21 @@ float *RE_strandren_get_simplify(struct ObjectRen *obr, struct StrandRen *strand
        return simplify + (strand->index & 255)*RE_SIMPLIFY_ELEMS;
 }
 
+int *RE_strandren_get_face(ObjectRen *obr, StrandRen *strand, int verify)
+{
+       int *face;
+       int nr= strand->index>>8;
+       
+       face= obr->strandnodes[nr].face;
+       if(face==NULL) {
+               if(verify) 
+                       face= obr->strandnodes[nr].face= MEM_callocN(256*RE_FACE_ELEMS*sizeof(int), "face table");
+               else
+                       return NULL;
+       }
+       return face + (strand->index & 255)*RE_FACE_ELEMS;
+}
+
 /* winspeed is exception, it is stored per instance */
 float *RE_strandren_get_winspeed(ObjectInstanceRen *obi, StrandRen *strand, int verify)
 {
@@ -757,6 +773,8 @@ void free_renderdata_strandnodes(StrandTableNode *strandnodes)
                        MEM_freeN(strandnodes[a].surfnor);
                if(strandnodes[a].simplify)
                        MEM_freeN(strandnodes[a].simplify);
+               if(strandnodes[a].face)
+                       MEM_freeN(strandnodes[a].face);
        }
        
        MEM_freeN(strandnodes);
index d862729317b7df0b1ab6bda916657574da5fea06..586d54aa04b2c44d8c1d43494222477424376486 100644 (file)
@@ -422,7 +422,7 @@ void shade_input_set_strand_texco(ShadeInput *shi, StrandRen *strand, StrandVert
                }
 
                if(texco & TEXCO_STRAND) {
-                       shi->strand= spoint->strandco;
+                       shi->strandco= spoint->strandco;
 
                        if(shi->osatex) {
                                shi->dxstrand= spoint->dtstrandco;
@@ -935,7 +935,7 @@ void shade_input_set_shade_texco(ShadeInput *shi)
                }
                
                if(texco & TEXCO_STRAND) {
-                       shi->strand= (l*v3->accum - u*v1->accum - v*v2->accum);
+                       shi->strandco= (l*v3->accum - u*v1->accum - v*v2->accum);
                        if(shi->osatex) {
                                dl= shi->dx_u+shi->dx_v;
                                shi->dxstrand= dl*v3->accum-shi->dx_u*v1->accum-shi->dx_v*v2->accum;
@@ -1236,7 +1236,7 @@ void shade_samples_do_AO(ShadeSample *ssamp)
        
        if(!(R.r.mode & R_SHADOW))
                return;
-       if(!(R.r.mode & R_RAYTRACE))
+       if(!(R.r.mode & R_RAYTRACE) && !(R.wrld.ao_gather_method == WO_AOGATHER_APPROX))
                return;
        
        if(R.wrld.mode & WO_AMB_OCC)
@@ -1248,7 +1248,7 @@ void shade_samples_do_AO(ShadeSample *ssamp)
 }
 
 
-static void shade_samples_fill_with_ps(ShadeSample *ssamp, PixStr *ps, int x, int y)
+void shade_samples_fill_with_ps(ShadeSample *ssamp, PixStr *ps, int x, int y)
 {
        ShadeInput *shi;
        float xs, ys;
index ca81d456487b9f355b2e6a991f0d00fbe5f7660c..0ba7c09cdd8c53c179beb1d087d39ad701f0b011 100644 (file)
@@ -43,6 +43,7 @@
 #include "DNA_material_types.h"
 
 /* local include */
+#include "occlusion.h"
 #include "renderpipeline.h"
 #include "render_types.h"
 #include "pixelblending.h"
@@ -1006,8 +1007,9 @@ static void do_specular_ramp(ShadeInput *shi, float is, float t, float *spec)
 /* pure AO, check for raytrace and world should have been done */
 void ambient_occlusion(ShadeInput *shi)
 {
-       
-       if((R.r.mode & R_RAYTRACE) && shi->mat->amb!=0.0f)
+       if((R.wrld.ao_gather_method == WO_AOGATHER_APPROX) && shi->mat->amb!=0.0f)
+               sample_occ(&R, shi);
+       else if((R.r.mode & R_RAYTRACE) && shi->mat->amb!=0.0f)
                ray_ao(shi, shi->ao);
        else
                shi->ao[0]= shi->ao[1]= shi->ao[2]= 1.0f;
@@ -1017,25 +1019,28 @@ void ambient_occlusion(ShadeInput *shi)
 /* wrld mode was checked for */
 void ambient_occlusion_to_diffuse(ShadeInput *shi, float *diff)
 {
-       
-       if((R.r.mode & R_RAYTRACE) && shi->mat->amb!=0.0f) {
-               float f= R.wrld.aoenergy*shi->mat->amb;
-
-               if (R.wrld.aomix==WO_AOADDSUB) {
-                       diff[0] = 2.0f*shi->ao[0]-1.0f;
-                       diff[1] = 2.0f*shi->ao[1]-1.0f;
-                       diff[2] = 2.0f*shi->ao[2]-1.0f;
-               }
-               else if (R.wrld.aomix==WO_AOSUB) {
-                       diff[0] = shi->ao[0]-1.0f;
-                       diff[1] = shi->ao[1]-1.0f;
-                       diff[2] = shi->ao[2]-1.0f;
-               }
-               else {
-                       VECCOPY(diff, shi->ao);
+       if((R.r.mode & R_RAYTRACE) || R.wrld.ao_gather_method == WO_AOGATHER_APPROX) {
+               if(shi->mat->amb!=0.0f) {
+                       float f= R.wrld.aoenergy*shi->mat->amb;
+
+                       if (R.wrld.aomix==WO_AOADDSUB) {
+                               diff[0] = 2.0f*shi->ao[0]-1.0f;
+                               diff[1] = 2.0f*shi->ao[1]-1.0f;
+                               diff[2] = 2.0f*shi->ao[2]-1.0f;
+                       }
+                       else if (R.wrld.aomix==WO_AOSUB) {
+                               diff[0] = shi->ao[0]-1.0f;
+                               diff[1] = shi->ao[1]-1.0f;
+                               diff[2] = shi->ao[2]-1.0f;
+                       }
+                       else {
+                               VECCOPY(diff, shi->ao);
+                       }
+                       
+                       VECMUL(diff, f);
                }
-               
-               VECMUL(diff, f);
+               else
+                       diff[0]= diff[1]= diff[2]= 0.0f;
        }
        else
                diff[0]= diff[1]= diff[2]= 0.0f;
index 692c86a68cf936fd6a52ca6dd14c60d66068934f..ccb3b3ee0f4de4c5b26e354c6e91a53e1f30e691 100644 (file)
@@ -685,6 +685,7 @@ static void strand_shade_point(Render *re, ShadeSample *ssamp, StrandSegment *ss
                vlr.flag |= R_TANGENT;
 
        shi->vlr= &vlr;
+       shi->strand= sseg->strand;
        shi->obi= sseg->obi;
        shi->obr= sseg->obi->obr;
 
index a4182637f0278fb7b2c5c9682918d0fb7c0c7d99..be248ddda3f89d530650e13581a85eae44eb395f 100644 (file)
@@ -1521,7 +1521,7 @@ void do_material_tex(ShadeInput *shi)
                        }
                        else if(mtex->texco==TEXCO_STRAND) {
                                co= tempvec; dx= dxt; dy= dyt;
-                               co[0]= shi->strand;
+                               co[0]= shi->strandco;
                                co[1]= co[2]= 0.0f;
                                dx[0]= shi->dxstrand;
                                dx[1]= dx[2]= 0.0f;
index c7ba1430748c8b59ca581afdf57a8a1db7c797d7..d21d4f3c8dec5bf102448254235c97ccb960b160 100644 (file)
@@ -2152,18 +2152,27 @@ static void world_panel_amb_occ(World *wrld)
        uiBlockSetCol(block, TH_AUTO);
        
        if(!(wrld->mode & WO_AMB_OCC)) return;
-       
-       yco -= YSPACE;
 
-       uiDefButS(block, NUM, B_REDR, "Samples:",
-               X2CLM1, yco-=BUTH, BUTW2, BUTH, &wrld->aosamp, 1.0, 32.0, 100, 0, "Sets the number of samples used for AO  (actual number: squared)");
-       
        yco -= YSPACE;
-       
-       uiDefButF(block, NUM, B_REDR, "Max Dist:",
-               X2CLM1, yco-=BUTH, BUTW2, BUTH, &wrld->aodist, 0.001, 5000.0, 100, 0, "Sets length of AO rays, defines how far away other faces give occlusion effect");
 
-       yco -= YSPACE;
+       if(wrld->ao_gather_method == WO_AOGATHER_RAYTRACE) {
+               uiDefButS(block, NUM, B_REDR, "Samples:",
+                       X2CLM1, yco-=BUTH, BUTW2, BUTH, &wrld->aosamp, 1.0, 32.0, 100, 0, "Sets the number of samples used for AO  (actual number: squared)");
+
+               yco -= YSPACE;
+               
+               uiDefButF(block, NUM, B_REDR, "Max Dist:",
+                       X2CLM1, yco-=BUTH, BUTW2, BUTH, &wrld->aodist, 0.001, 5000.0, 100, 0, "Sets length of AO rays, defines how far away other faces give occlusion effect");
+       }
+       else {
+               uiDefButS(block, NUM, B_REDR, "Passes:",
+                       X2CLM1, yco-=BUTH, BUTW2, BUTH, &wrld->ao_approx_passes, 0.0, 10.0, 0, 0, "Sets the number of preprocessing passes to reduce overocclusion");
+
+               yco -= YSPACE;
+       
+               uiDefButF(block, NUM, B_REDR, "Correction:",
+                       X2CLM1, yco-=BUTH, BUTW2, BUTH, &wrld->ao_approx_correction, 0.0, 1.0, 0, 0, "Ad-hoc correction for over-occlusion due to the approximation.");
+       }
        
        uiBlockBeginAlign(block);
        uiDefButBitS(block, TOG, WO_AODIST, B_AO_FALLOFF, "Use Falloff",
@@ -2176,24 +2185,39 @@ static void world_panel_amb_occ(World *wrld)
        /* column 2 */
        yco = PANEL_YMAX - BUTH - YSPACE;
        
-       uiDefButS(block, MENU, B_REDR, "Constant QMC %x2|Adaptive QMC %x1|Constant Jittered %x0",
-               X2CLM2, yco-=BUTH, BUTW2, BUTH, &wrld->ao_samp_method, 0, 0, 0, 0, "Method for generating shadow samples: Constant QMC: best quality, Adaptive QMC: fast in high contrast areas");
-       
+       uiDefButS(block, MENU, B_REDR, "Gather Method%t|Raytrace %x0|Approximate %x1",
+               X2CLM2, yco-=BUTH, BUTW2, BUTH, &wrld->ao_gather_method, 0, 0, 0, 0, "Method for occlusion gathering: Raytrace: slow when noise free results are required, but accurate, Approximate: faster and without noise, but inaccurate");
+
        yco -= YSPACE;
-       
-       if (wrld->ao_samp_method == WO_AOSAMP_HALTON) { 
+
+       if(wrld->ao_gather_method == WO_AOGATHER_RAYTRACE) {
+               uiDefButS(block, MENU, B_REDR, "Constant QMC %x2|Adaptive QMC %x1|Constant Jittered %x0",
+                       X2CLM2, yco-=BUTH, BUTW2, BUTH, &wrld->ao_samp_method, 0, 0, 0, 0, "Method for generating shadow samples: Constant QMC: best quality, Adaptive QMC: fast in high contrast areas");
+               
+               yco -= YSPACE;
+
+               if (wrld->ao_samp_method == WO_AOSAMP_HALTON) { 
+                       uiBlockBeginAlign(block);
+                       uiDefButF(block, NUM, B_REDR, "Threshold:",
+                               X2CLM2, yco-=BUTH, BUTW2, BUTH, &wrld->ao_adapt_thresh, 0.0, 1.0, 100, 0, "Samples below this threshold will be considered fully shadowed/unshadowed and skipped");
+                       uiDefButF(block, NUMSLI, B_REDR, "Adapt Vec:",
+                               X2CLM2, yco-=BUTH, BUTW2, BUTH, &wrld->ao_adapt_speed_fac, 0.0, 1.0, 100, 0, "Use the speed vector pass to reduce AO samples in fast moving pixels. The higher the value, the more aggressive the sample reduction. Requires Vec pass enabled.");
+                       uiBlockEndAlign(block);
+               } else if (wrld->ao_samp_method == WO_AOSAMP_CONSTANT) {
+                       uiDefButF(block, NUMSLI, B_REDR, "Bias:",
+                               X2CLM2, yco-=BUTH, BUTW2, BUTH, &wrld->aobias, 0.0, 0.5, 10, 0, "Sets bias to prevent smoothed faces to show banding (in radians)");
+               }
+       }
+       else {
                uiBlockBeginAlign(block);
-               uiDefButF(block, NUM, B_REDR, "Threshold:",
-                       X2CLM2, yco-=BUTH, BUTW2, BUTH, &wrld->ao_adapt_thresh, 0.0, 1.0, 100, 0, "Samples below this threshold will be considered fully shadowed/unshadowed and skipped");
-               uiDefButF(block, NUMSLI, B_REDR, "Adapt Vec:",
-                       X2CLM2, yco-=BUTH, BUTW2, BUTH, &wrld->ao_adapt_speed_fac, 0.0, 1.0, 100, 0, "Use the speed vector pass to reduce AO samples in fast moving pixels. The higher the value, the more aggressive the sample reduction. Requires Vec pass enabled.");
+               uiDefButF(block, NUM, B_REDR, "Error:",
+                       X2CLM2, yco-=BUTH, BUTW2, BUTH, &wrld->ao_approx_error, 0.0001, 10.0, 0, 0, "Error tolerance (low values are slower and higher quality)");
+
+               uiDefButBitS(block, TOG, WO_AOCACHE, B_REDR, "Pixel Cache",
+                       X2CLM2, yco-=BUTH, BUTW2, BUTH, &wrld->aomode, 0, 0, 0, 0, "Cache AO results in pixels and interpolate over neighbouring pixels for speedup.");
                uiBlockEndAlign(block);
-       } else if (wrld->ao_samp_method == WO_AOSAMP_CONSTANT) {
-               uiDefButF(block, NUMSLI, B_REDR, "Bias:",
-                       X2CLM2, yco-=BUTH, BUTW2, BUTH, &wrld->aobias, 0.0, 0.5, 10, 0, "Sets bias to prevent smoothed faces to show banding (in radians)");
        }
 
-
        yco = PANEL_YMAX - (5*BUTH+4*YSPACE);
 
        /* result mix modes */
@@ -2204,6 +2228,7 @@ static void world_panel_amb_occ(World *wrld)
                X3CLM2, yco, BUTW3, BUTH, &wrld->aomix, 1.0, (float)WO_AOSUB, 0, 0, "subtracts light/shadows (needs at least one normal light to make anything visible)");
        uiDefButS(block, ROW, B_REDR, "Both",
                X3CLM3, yco, BUTW3, BUTH, &wrld->aomix, 1.0, (float)WO_AOADDSUB, 0, 0, "both lightens & darkens");
+       uiBlockEndAlign(block);
 
        yco -= YSPACE;
 
@@ -2216,12 +2241,11 @@ static void world_panel_amb_occ(World *wrld)
        uiDefButS(block, ROW, B_REDR, "Sky Texture", 
                X3CLM3, yco, BUTW3, BUTH, &wrld->aocolor, 2.0, (float)WO_AOSKYTEX, 0, 0, "Does full Sky texture render for diffuse energy");
        uiBlockEndAlign(block);
-       
+
        yco -= YSPACE;
-       
+               
        uiDefButF(block, NUMSLI, B_REDR, "Energy:",
                X2CLM1, yco-=BUTH, BUTW2, BUTH, &wrld->aoenergy, 0.01, 3.0, 100, 0, "Sets global energy scale for AO");
-       
 }
 
 static void world_panel_world(World *wrld)