Filling in branch from trunk
[blender.git] / source / blender / nodes / intern / CMP_util.c
1 /**
2  * $Id$
3  *
4  * ***** BEGIN GPL LICENSE BLOCK *****
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version. 
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software Foundation,
18  * Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
19  *
20  * The Original Code is Copyright (C) 2006 Blender Foundation.
21  * All rights reserved.
22  *
23  * The Original Code is: all of this file.
24  *
25  * Contributor(s): none yet.
26  *
27  * ***** END GPL LICENSE BLOCK *****
28  */
29
30 #include "CMP_util.h"
31
32 CompBuf *alloc_compbuf(int sizex, int sizey, int type, int alloc)
33 {
34         CompBuf *cbuf= MEM_callocN(sizeof(CompBuf), "compbuf");
35         
36         cbuf->x= sizex;
37         cbuf->y= sizey;
38         cbuf->xrad= sizex/2;
39         cbuf->yrad= sizey/2;
40         
41         cbuf->type= type;
42         if(alloc) {
43                 if(cbuf->type==CB_RGBA)
44                         cbuf->rect= MEM_mapallocN(4*sizeof(float)*sizex*sizey, "compbuf RGBA rect");
45                 else if(cbuf->type==CB_VEC3)
46                         cbuf->rect= MEM_mapallocN(3*sizeof(float)*sizex*sizey, "compbuf Vector3 rect");
47                 else if(cbuf->type==CB_VEC2)
48                         cbuf->rect= MEM_mapallocN(2*sizeof(float)*sizex*sizey, "compbuf Vector2 rect");
49                 else
50                         cbuf->rect= MEM_mapallocN(sizeof(float)*sizex*sizey, "compbuf Fac rect");
51                 cbuf->malloc= 1;
52         }
53         cbuf->disprect.xmin= 0;
54         cbuf->disprect.ymin= 0;
55         cbuf->disprect.xmax= sizex;
56         cbuf->disprect.ymax= sizey;
57         
58         return cbuf;
59 }
60
61 CompBuf *dupalloc_compbuf(CompBuf *cbuf)
62 {
63         CompBuf *dupbuf= alloc_compbuf(cbuf->x, cbuf->y, cbuf->type, 1);
64         if(dupbuf) {
65                 memcpy(dupbuf->rect, cbuf->rect, cbuf->type*sizeof(float)*cbuf->x*cbuf->y);
66         
67                 dupbuf->xof= cbuf->xof;
68                 dupbuf->yof= cbuf->yof;
69         }       
70         return dupbuf;
71 }
72
73 /* instead of reference counting, we create a list */
74 CompBuf *pass_on_compbuf(CompBuf *cbuf)
75 {
76         CompBuf *dupbuf= alloc_compbuf(cbuf->x, cbuf->y, cbuf->type, 0);
77         CompBuf *lastbuf;
78         
79         if(dupbuf) {
80                 dupbuf->rect= cbuf->rect;
81                 dupbuf->xof= cbuf->xof;
82                 dupbuf->yof= cbuf->yof;
83                 dupbuf->malloc= 0;
84                 
85                 /* get last buffer in list, and append dupbuf */
86                 for(lastbuf= cbuf; lastbuf; lastbuf= lastbuf->next)
87                         if(lastbuf->next==NULL)
88                                 break;
89                 lastbuf->next= dupbuf;
90                 dupbuf->prev= lastbuf;
91         }       
92         return dupbuf;
93 }
94
95
96 void free_compbuf(CompBuf *cbuf)
97 {
98         /* check referencing, then remove from list and set malloc tag */
99         if(cbuf->prev || cbuf->next) {
100                 if(cbuf->prev)
101                         cbuf->prev->next= cbuf->next;
102                 if(cbuf->next)
103                         cbuf->next->prev= cbuf->prev;
104                 if(cbuf->malloc) {
105                         if(cbuf->prev)
106                                 cbuf->prev->malloc= 1;
107                         else
108                                 cbuf->next->malloc= 1;
109                         cbuf->malloc= 0;
110                 }
111         }
112         
113         if(cbuf->malloc && cbuf->rect)
114                 MEM_freeN(cbuf->rect);
115
116         MEM_freeN(cbuf);
117 }
118
119 void print_compbuf(char *str, CompBuf *cbuf)
120 {
121         printf("Compbuf %s %d %d %p\n", str, cbuf->x, cbuf->y, cbuf->rect);
122         
123 }
124
125 CompBuf *get_cropped_compbuf(rcti *drect, float *rectf, int rectx, int recty, int type)
126 {
127         CompBuf *cbuf;
128         rcti disprect= *drect;
129         float *outfp;
130         int dx, y;
131         
132         if(disprect.xmax>rectx) disprect.xmax= rectx;
133         if(disprect.ymax>recty) disprect.ymax= recty;
134         if(disprect.xmin>= disprect.xmax) return NULL;
135         if(disprect.ymin>= disprect.ymax) return NULL;
136         
137         cbuf= alloc_compbuf(disprect.xmax-disprect.xmin, disprect.ymax-disprect.ymin, type, 1);
138         outfp= cbuf->rect;
139         rectf += type*(disprect.ymin*rectx + disprect.xmin);
140         dx= type*cbuf->x;
141         for(y=cbuf->y; y>0; y--, outfp+=dx, rectf+=type*rectx)
142                 memcpy(outfp, rectf, sizeof(float)*dx);
143         
144         return cbuf;
145 }
146
147 CompBuf *scalefast_compbuf(CompBuf *inbuf, int newx, int newy)
148 {
149         CompBuf *outbuf; 
150         float *rectf, *newrectf, *rf;
151         int x, y, c, pixsize= inbuf->type;
152         int ofsx, ofsy, stepx, stepy;
153         
154         if(inbuf->x==newx && inbuf->y==newy)
155                 return dupalloc_compbuf(inbuf);
156         
157         outbuf= alloc_compbuf(newx, newy, inbuf->type, 1);
158         newrectf= outbuf->rect;
159         
160         stepx = (65536.0 * (inbuf->x - 1.0) / (newx - 1.0)) + 0.5;
161         stepy = (65536.0 * (inbuf->y - 1.0) / (newy - 1.0)) + 0.5;
162         ofsy = 32768;
163         
164         for (y = newy; y > 0 ; y--){
165                 rectf = inbuf->rect;
166                 rectf += pixsize * (ofsy >> 16) * inbuf->x;
167
168                 ofsy += stepy;
169                 ofsx = 32768;
170                 
171                 for (x = newx ; x>0 ; x--) {
172                         
173                         rf= rectf + pixsize*(ofsx >> 16);
174                         for(c=0; c<pixsize; c++)
175                                 newrectf[c] = rf[c];
176                         
177                         newrectf+= pixsize;
178                         
179                         ofsx += stepx;
180                 }
181         }
182         
183         return outbuf;
184 }
185
186 CompBuf *typecheck_compbuf(CompBuf *inbuf, int type)
187 {
188         if(inbuf && inbuf->type!=type && inbuf->rect_procedural==NULL) {
189                 CompBuf *outbuf= alloc_compbuf(inbuf->x, inbuf->y, type, 1); 
190                 float *inrf= inbuf->rect;
191                 float *outrf= outbuf->rect;
192                 int x= inbuf->x*inbuf->y;
193                 
194                 /* warning note: xof and yof are applied in pixelprocessor, but should be copied otherwise? */
195                 outbuf->xof= inbuf->xof;
196                 outbuf->yof= inbuf->yof;
197                 
198                 if(type==CB_VAL) {
199                         if(inbuf->type==CB_VEC2) {
200                                 for(; x>0; x--, outrf+= 1, inrf+= 2)
201                                         *outrf= 0.5f*(inrf[0]+inrf[1]);
202                         }
203                         else if(inbuf->type==CB_VEC3) {
204                                 for(; x>0; x--, outrf+= 1, inrf+= 3)
205                                         *outrf= 0.333333f*(inrf[0]+inrf[1]+inrf[2]);
206                         }
207                         else if(inbuf->type==CB_RGBA) {
208                                 for(; x>0; x--, outrf+= 1, inrf+= 4)
209                                         *outrf= inrf[0]*0.35f + inrf[1]*0.45f + inrf[2]*0.2f;
210                         }
211                 }
212                 else if(type==CB_VEC2) {
213                         if(inbuf->type==CB_VAL) {
214                                 for(; x>0; x--, outrf+= 2, inrf+= 1) {
215                                         outrf[0]= inrf[0];
216                                         outrf[1]= inrf[0];
217                                 }
218                         }
219                         else if(inbuf->type==CB_VEC3) {
220                                 for(; x>0; x--, outrf+= 2, inrf+= 3) {
221                                         outrf[0]= inrf[0];
222                                         outrf[1]= inrf[1];
223                                 }
224                         }
225                         else if(inbuf->type==CB_RGBA) {
226                                 for(; x>0; x--, outrf+= 2, inrf+= 4) {
227                                         outrf[0]= inrf[0];
228                                         outrf[1]= inrf[1];
229                                 }
230                         }
231                 }
232                 else if(type==CB_VEC3) {
233                         if(inbuf->type==CB_VAL) {
234                                 for(; x>0; x--, outrf+= 3, inrf+= 1) {
235                                         outrf[0]= inrf[0];
236                                         outrf[1]= inrf[0];
237                                         outrf[2]= inrf[0];
238                                 }
239                         }
240                         else if(inbuf->type==CB_VEC2) {
241                                 for(; x>0; x--, outrf+= 3, inrf+= 2) {
242                                         outrf[0]= inrf[0];
243                                         outrf[1]= inrf[1];
244                                         outrf[2]= 0.0f;
245                                 }
246                         }
247                         else if(inbuf->type==CB_RGBA) {
248                                 for(; x>0; x--, outrf+= 3, inrf+= 4) {
249                                         outrf[0]= inrf[0];
250                                         outrf[1]= inrf[1];
251                                         outrf[2]= inrf[2];
252                                 }
253                         }
254                 }
255                 else if(type==CB_RGBA) {
256                         if(inbuf->type==CB_VAL) {
257                                 for(; x>0; x--, outrf+= 4, inrf+= 1) {
258                                         outrf[0]= inrf[0];
259                                         outrf[1]= inrf[0];
260                                         outrf[2]= inrf[0];
261                                         outrf[3]= 1.0f;
262                                 }
263                         }
264                         else if(inbuf->type==CB_VEC2) {
265                                 for(; x>0; x--, outrf+= 4, inrf+= 2) {
266                                         outrf[0]= inrf[0];
267                                         outrf[1]= inrf[1];
268                                         outrf[2]= 0.0f;
269                                         outrf[3]= 1.0f;
270                                 }
271                         }
272                         else if(inbuf->type==CB_VEC3) {
273                                 for(; x>0; x--, outrf+= 4, inrf+= 3) {
274                                         outrf[0]= inrf[0];
275                                         outrf[1]= inrf[1];
276                                         outrf[2]= inrf[2];
277                                         outrf[3]= 1.0f;
278                                 }
279                         }
280                 }
281                 
282                 return outbuf;
283         }
284         return inbuf;
285 }
286
287 float *compbuf_get_pixel(CompBuf *cbuf, float *rectf, int x, int y, int xrad, int yrad)
288 {
289         if(cbuf) {
290                 if(cbuf->rect_procedural) {
291                         cbuf->rect_procedural(cbuf, rectf, (float)x/(float)xrad, (float)y/(float)yrad);
292                         return rectf;
293                 }
294                 else {
295                         static float col[4]= {0.0f, 0.0f, 0.0f, 0.0f};
296                         
297                         /* map coords */
298                         x-= cbuf->xof;
299                         y-= cbuf->yof;
300                         
301                         if(y<-cbuf->yrad || y>= -cbuf->yrad+cbuf->y) return col;
302                         if(x<-cbuf->xrad || x>= -cbuf->xrad+cbuf->x) return col;
303                         
304                         return cbuf->rect + cbuf->type*( (cbuf->yrad+y)*cbuf->x + (cbuf->xrad+x) );
305                 }
306         }
307         else return rectf;
308 }
309
310 /* **************************************************** */
311
312 /* Pixel-to-Pixel operation, 1 Image in, 1 out */
313 void composit1_pixel_processor(bNode *node, CompBuf *out, CompBuf *src_buf, float *src_col,
314                                                                           void (*func)(bNode *, float *, float *), 
315                                                                           int src_type)
316 {
317         CompBuf *src_use;
318         float *outfp=out->rect, *srcfp;
319         int xrad, yrad, x, y;
320         
321         src_use= typecheck_compbuf(src_buf, src_type);
322         
323         xrad= out->xrad;
324         yrad= out->yrad;
325         
326         for(y= -yrad; y<-yrad+out->y; y++) {
327                 for(x= -xrad; x<-xrad+out->x; x++, outfp+=out->type) {
328                         srcfp= compbuf_get_pixel(src_use, src_col, x, y, xrad, yrad);
329                         func(node, outfp, srcfp);
330                 }
331         }
332         
333         if(src_use!=src_buf)
334                 free_compbuf(src_use);
335 }
336
337 /* Pixel-to-Pixel operation, 2 Images in, 1 out */
338 void composit2_pixel_processor(bNode *node, CompBuf *out, CompBuf *src_buf, float *src_col,
339                                                                           CompBuf *fac_buf, float *fac, void (*func)(bNode *, float *, float *, float *), 
340                                                                           int src_type, int fac_type)
341 {
342         CompBuf *src_use, *fac_use;
343         float *outfp=out->rect, *srcfp, *facfp;
344         int xrad, yrad, x, y;
345         
346         src_use= typecheck_compbuf(src_buf, src_type);
347         fac_use= typecheck_compbuf(fac_buf, fac_type);
348
349         xrad= out->xrad;
350         yrad= out->yrad;
351         
352         for(y= -yrad; y<-yrad+out->y; y++) {
353                 for(x= -xrad; x<-xrad+out->x; x++, outfp+=out->type) {
354                         srcfp= compbuf_get_pixel(src_use, src_col, x, y, xrad, yrad);
355                         facfp= compbuf_get_pixel(fac_use, fac, x, y, xrad, yrad);
356                         
357                         func(node, outfp, srcfp, facfp);
358                 }
359         }
360         if(src_use!=src_buf)
361                 free_compbuf(src_use);
362         if(fac_use!=fac_buf)
363                 free_compbuf(fac_use);
364 }
365
366 /* Pixel-to-Pixel operation, 3 Images in, 1 out */
367 void composit3_pixel_processor(bNode *node, CompBuf *out, CompBuf *src1_buf, float *src1_col, CompBuf *src2_buf, float *src2_col, 
368                                                                           CompBuf *fac_buf, float *fac, void (*func)(bNode *, float *, float *, float *, float *), 
369                                                                           int src1_type, int src2_type, int fac_type)
370 {
371         CompBuf *src1_use, *src2_use, *fac_use;
372         float *outfp=out->rect, *src1fp, *src2fp, *facfp;
373         int xrad, yrad, x, y;
374         
375         src1_use= typecheck_compbuf(src1_buf, src1_type);
376         src2_use= typecheck_compbuf(src2_buf, src2_type);
377         fac_use= typecheck_compbuf(fac_buf, fac_type);
378         
379         xrad= out->xrad;
380         yrad= out->yrad;
381         
382         for(y= -yrad; y<-yrad+out->y; y++) {
383                 for(x= -xrad; x<-xrad+out->x; x++, outfp+=out->type) {
384                         src1fp= compbuf_get_pixel(src1_use, src1_col, x, y, xrad, yrad);
385                         src2fp= compbuf_get_pixel(src2_use, src2_col, x, y, xrad, yrad);
386                         facfp= compbuf_get_pixel(fac_use, fac, x, y, xrad, yrad);
387                         
388                         func(node, outfp, src1fp, src2fp, facfp);
389                 }
390         }
391         
392         if(src1_use!=src1_buf)
393                 free_compbuf(src1_use);
394         if(src2_use!=src2_buf)
395                 free_compbuf(src2_use);
396         if(fac_use!=fac_buf)
397                 free_compbuf(fac_use);
398 }
399
400 /* Pixel-to-Pixel operation, 4 Images in, 1 out */
401 void composit4_pixel_processor(bNode *node, CompBuf *out, CompBuf *src1_buf, float *src1_col, CompBuf *fac1_buf, float *fac1, 
402                                                                           CompBuf *src2_buf, float *src2_col, CompBuf *fac2_buf, float *fac2, 
403                                                                           void (*func)(bNode *, float *, float *, float *, float *, float *), 
404                                                                           int src1_type, int fac1_type, int src2_type, int fac2_type)
405 {
406         CompBuf *src1_use, *src2_use, *fac1_use, *fac2_use;
407         float *outfp=out->rect, *src1fp, *src2fp, *fac1fp, *fac2fp;
408         int xrad, yrad, x, y;
409         
410         src1_use= typecheck_compbuf(src1_buf, src1_type);
411         src2_use= typecheck_compbuf(src2_buf, src2_type);
412         fac1_use= typecheck_compbuf(fac1_buf, fac1_type);
413         fac2_use= typecheck_compbuf(fac2_buf, fac2_type);
414         
415         xrad= out->xrad;
416         yrad= out->yrad;
417         
418         for(y= -yrad; y<-yrad+out->y; y++) {
419                 for(x= -xrad; x<-xrad+out->x; x++, outfp+=out->type) {
420                         src1fp= compbuf_get_pixel(src1_use, src1_col, x, y, xrad, yrad);
421                         src2fp= compbuf_get_pixel(src2_use, src2_col, x, y, xrad, yrad);
422                         fac1fp= compbuf_get_pixel(fac1_use, fac1, x, y, xrad, yrad);
423                         fac2fp= compbuf_get_pixel(fac2_use, fac2, x, y, xrad, yrad);
424                         
425                         func(node, outfp, src1fp, fac1fp, src2fp, fac2fp);
426                 }
427         }
428         
429         if(src1_use!=src1_buf)
430                 free_compbuf(src1_use);
431         if(src2_use!=src2_buf)
432                 free_compbuf(src2_use);
433         if(fac1_use!=fac1_buf)
434                 free_compbuf(fac1_use);
435         if(fac2_use!=fac2_buf)
436                 free_compbuf(fac2_use);
437 }
438
439
440 CompBuf *valbuf_from_rgbabuf(CompBuf *cbuf, int channel)
441 {
442         CompBuf *valbuf= alloc_compbuf(cbuf->x, cbuf->y, CB_VAL, 1);
443         float *valf, *rectf;
444         int tot;
445         
446         /* warning note: xof and yof are applied in pixelprocessor, but should be copied otherwise? */
447         valbuf->xof= cbuf->xof;
448         valbuf->yof= cbuf->yof;
449         
450         valf= valbuf->rect;
451         
452         /* defaults to returning alpha channel */
453         if ((channel < CHAN_R) && (channel > CHAN_A)) channel = CHAN_A;
454
455         rectf= cbuf->rect + channel;
456         
457         for(tot= cbuf->x*cbuf->y; tot>0; tot--, valf++, rectf+=4)
458                 *valf= *rectf;
459         
460         return valbuf;
461 }
462
463 void generate_preview(bNode *node, CompBuf *stackbuf)
464 {
465         bNodePreview *preview= node->preview;
466         
467         if(preview && stackbuf) {
468                 CompBuf *cbuf, *stackbuf_use;
469                 
470                 if(stackbuf->rect==NULL) return;
471                 
472                 stackbuf_use= typecheck_compbuf(stackbuf, CB_RGBA);
473                 
474                 if(stackbuf->x > stackbuf->y) {
475                         preview->xsize= 140;
476                         preview->ysize= (140*stackbuf->y)/stackbuf->x;
477                 }
478                 else {
479                         preview->ysize= 140;
480                         preview->xsize= (140*stackbuf->x)/stackbuf->y;
481                 }
482                 
483                 cbuf= scalefast_compbuf(stackbuf_use, preview->xsize, preview->ysize);
484                 
485                 /* this ensures free-compbuf does the right stuff */
486                 SWAP(float *, cbuf->rect, node->preview->rect);
487                 
488                 free_compbuf(cbuf);
489                 if(stackbuf_use!=stackbuf)
490                         free_compbuf(stackbuf_use);
491
492         }
493 }
494
495 void do_rgba_to_yuva(bNode *node, float *out, float *in)
496 {
497    rgb_to_yuv(in[0],in[1],in[2], &out[0], &out[1], &out[2]);
498    out[3]=in[3];
499 }
500
501 void do_rgba_to_hsva(bNode *node, float *out, float *in)
502 {
503    rgb_to_hsv(in[0],in[1],in[2], &out[0], &out[1], &out[2]);
504    out[3]=in[3];
505 }
506
507 void do_rgba_to_ycca(bNode *node, float *out, float *in)
508 {
509    rgb_to_ycc(in[0],in[1],in[2], &out[0], &out[1], &out[2]);
510    out[3]=in[3];
511 }
512
513 void do_yuva_to_rgba(bNode *node, float *out, float *in)
514 {
515    yuv_to_rgb(in[0],in[1],in[2], &out[0], &out[1], &out[2]);
516    out[3]=in[3];
517 }
518
519 void do_hsva_to_rgba(bNode *node, float *out, float *in)
520 {
521    hsv_to_rgb(in[0],in[1],in[2], &out[0], &out[1], &out[2]);
522    out[3]=in[3];
523 }
524
525 void do_ycca_to_rgba(bNode *node, float *out, float *in)
526 {
527    ycc_to_rgb(in[0],in[1],in[2], &out[0], &out[1], &out[2]);
528    out[3]=in[3];
529 }
530
531 void do_copy_rgba(bNode *node, float *out, float *in)
532 {
533    QUATCOPY(out, in);
534 }
535
536 void do_copy_rgb(bNode *node, float *out, float *in)
537 {
538    VECCOPY(out, in);
539    out[3]= 1.0f;
540 }
541
542 void do_copy_value(bNode *node, float *out, float *in)
543 {
544    out[0]= in[0];
545 }
546
547 void do_copy_a_rgba(bNode *node, float *out, float *in, float *fac)
548 {
549    VECCOPY(out, in);
550    out[3]= *fac;
551 }
552
553 /* only accepts RGBA buffers */
554 void gamma_correct_compbuf(CompBuf *img, int inversed)
555 {
556    float *drect;
557    int x;
558
559    if(img->type!=CB_RGBA) return;
560
561    drect= img->rect;
562    if(inversed) {
563       for(x=img->x*img->y; x>0; x--, drect+=4) {
564          if(drect[0]>0.0f) drect[0]= sqrt(drect[0]); else drect[0]= 0.0f;
565          if(drect[1]>0.0f) drect[1]= sqrt(drect[1]); else drect[1]= 0.0f;
566          if(drect[2]>0.0f) drect[2]= sqrt(drect[2]); else drect[2]= 0.0f;
567       }
568    }
569    else {
570       for(x=img->x*img->y; x>0; x--, drect+=4) {
571          if(drect[0]>0.0f) drect[0]*= drect[0]; else drect[0]= 0.0f;
572          if(drect[1]>0.0f) drect[1]*= drect[1]; else drect[1]= 0.0f;
573          if(drect[2]>0.0f) drect[2]*= drect[2]; else drect[2]= 0.0f;
574       }
575    }
576 }
577
578
579
580
581 /*
582  *  2D Fast Hartley Transform, used for convolution
583  */
584
585 typedef float fREAL;
586
587 // returns next highest power of 2 of x, as well it's log2 in L2
588 static unsigned int nextPow2(unsigned int x, unsigned int* L2)
589 {
590         unsigned int pw, x_notpow2 = x & (x-1);
591         *L2 = 0;
592         while (x>>=1) ++(*L2);
593         pw = 1 << (*L2);
594         if (x_notpow2) { (*L2)++;  pw<<=1; }
595         return pw;
596 }
597
598 //------------------------------------------------------------------------------
599
600 // from FXT library by Joerg Arndt, faster in order bitreversal
601 // use: r = revbin_upd(r, h) where h = N>>1
602 static unsigned int revbin_upd(unsigned int r, unsigned int h)
603 {
604         while (!((r^=h)&h)) h >>= 1;
605         return r;
606 }
607 //------------------------------------------------------------------------------
608 static void FHT(fREAL* data, unsigned int M, unsigned int inverse)
609 {
610         double tt, fc, dc, fs, ds, a = M_PI;
611         fREAL t1, t2;
612         int n2, bd, bl, istep, k, len = 1 << M, n = 1;
613
614         int i, j = 0;
615         unsigned int Nh = len >> 1;
616         for (i=1;i<(len-1);++i) {
617                 j = revbin_upd(j, Nh);
618                 if (j>i) {
619                         t1 = data[i];
620                         data[i] = data[j];
621                         data[j] = t1;
622                 }
623         }
624
625         do {
626                 fREAL* data_n = &data[n];
627
628                 istep = n << 1;
629                 for (k=0; k<len; k+=istep) {
630                         t1 = data_n[k];
631                         data_n[k] = data[k] - t1;
632                         data[k] += t1;
633                 }
634
635                 n2 = n >> 1;
636                 if (n>2) {
637                         fc = dc = cos(a);
638                         fs = ds = sqrt(1.0 - fc*fc); //sin(a);
639                         bd = n-2;
640                         for (bl=1; bl<n2; bl++) {
641                                 fREAL* data_nbd = &data_n[bd];
642                                 fREAL* data_bd = &data[bd];
643                                 for (k=bl; k<len; k+=istep) {
644                                         t1 = fc*data_n[k] + fs*data_nbd[k];
645                                         t2 = fs*data_n[k] - fc*data_nbd[k];
646                                         data_n[k] = data[k] - t1;
647                                         data_nbd[k] = data_bd[k] - t2;
648                                         data[k] += t1;
649                                         data_bd[k] += t2;
650                                 }
651                                 tt = fc*dc - fs*ds;
652                                 fs = fs*dc + fc*ds;
653                                 fc = tt;
654                                 bd -= 2;
655                         }
656                 }
657
658                 if (n>1) {
659                         for (k=n2; k<len; k+=istep) {
660                                 t1 = data_n[k];
661                                 data_n[k] = data[k] - t1;
662                                 data[k] += t1;
663                         }
664                 }
665
666                 n = istep;
667                 a *= 0.5;
668         } while (n<len);
669
670         if (inverse) {
671                 fREAL sc = (fREAL)1 / (fREAL)len;
672                 for (k=0; k<len; ++k)
673                         data[k] *= sc;
674         }
675 }
676 //------------------------------------------------------------------------------
677 /* 2D Fast Hartley Transform, Mx/My -> log2 of width/height,
678         nzp -> the row where zero pad data starts,
679         inverse -> see above */
680 static void FHT2D(fREAL *data, unsigned int Mx, unsigned int My,
681                 unsigned int nzp, unsigned int inverse)
682 {
683         unsigned int i, j, Nx, Ny, maxy;
684         fREAL t;
685
686         Nx = 1 << Mx;
687         Ny = 1 << My;
688
689         // rows (forward transform skips 0 pad data)
690         maxy = inverse ? Ny : nzp;
691         for (j=0; j<maxy; ++j)
692                 FHT(&data[Nx*j], Mx, inverse);
693
694         // transpose data
695         if (Nx==Ny) {  // square
696                 for (j=0; j<Ny; ++j)
697                         for (i=j+1; i<Nx; ++i) {
698                                 unsigned int op = i + (j << Mx), np = j + (i << My);
699                                 t=data[op], data[op]=data[np], data[np]=t;
700                         }
701         }
702         else {  // rectangular
703                 unsigned int k, Nym = Ny-1, stm = 1 << (Mx + My);
704                 for (i=0; stm>0; i++) {
705                         #define pred(k) (((k & Nym) << Mx) + (k >> My))
706                         for (j=pred(i); j>i; j=pred(j));
707                         if (j < i) continue;
708                         for (k=i, j=pred(i); j!=i; k=j, j=pred(j), stm--)
709                                 { t=data[j], data[j]=data[k], data[k]=t; }
710                         #undef pred
711                         stm--;
712                 }
713         }
714         // swap Mx/My & Nx/Ny
715         i = Nx, Nx = Ny, Ny = i;
716         i = Mx, Mx = My, My = i;
717
718         // now columns == transposed rows
719         for (j=0; j<Ny; ++j)
720                 FHT(&data[Nx*j], Mx, inverse);
721
722         // finalize
723         for (j=0; j<=(Ny >> 1); j++) {
724                 unsigned int jm = (Ny - j) & (Ny-1);
725                 unsigned int ji = j << Mx;
726                 unsigned int jmi = jm << Mx;
727                 for (i=0; i<=(Nx >> 1); i++) {
728                         unsigned int im = (Nx - i) & (Nx-1);
729                         fREAL A = data[ji + i];
730                         fREAL B = data[jmi + i];
731                         fREAL C = data[ji + im];
732                         fREAL D = data[jmi + im];
733                         fREAL E = (fREAL)0.5*((A + D) - (B + C));
734                         data[ji + i] = A - E;
735                         data[jmi + i] = B + E;
736                         data[ji + im] = C + E;
737                         data[jmi + im] = D - E;
738                 }
739         }
740
741 }
742
743 //------------------------------------------------------------------------------
744
745 /* 2D convolution calc, d1 *= d2, M/N - > log2 of width/height */
746 static void fht_convolve(fREAL* d1, fREAL* d2, unsigned int M, unsigned int N)
747 {
748         fREAL a, b;
749         unsigned int i, j, k, L, mj, mL;
750         unsigned int m = 1 << M, n = 1 << N;
751         unsigned int m2 = 1 << (M-1), n2 = 1 << (N-1);
752         unsigned int mn2 = m << (N-1);
753
754         d1[0] *= d2[0];
755         d1[mn2] *= d2[mn2];
756         d1[m2] *= d2[m2];
757         d1[m2 + mn2] *= d2[m2 + mn2];
758         for (i=1; i<m2; i++) {
759                 k = m - i;
760                 a = d1[i]*d2[i] - d1[k]*d2[k];
761                 b = d1[k]*d2[i] + d1[i]*d2[k];
762                 d1[i] = (b + a)*(fREAL)0.5;
763                 d1[k] = (b - a)*(fREAL)0.5;
764                 a = d1[i + mn2]*d2[i + mn2] - d1[k + mn2]*d2[k + mn2];
765                 b = d1[k + mn2]*d2[i + mn2] + d1[i + mn2]*d2[k + mn2];
766                 d1[i + mn2] = (b + a)*(fREAL)0.5;
767                 d1[k + mn2] = (b - a)*(fREAL)0.5;
768         }
769         for (j=1; j<n2; j++) {
770                 L = n - j;
771                 mj = j << M;
772                 mL = L << M;
773                 a = d1[mj]*d2[mj] - d1[mL]*d2[mL];
774                 b = d1[mL]*d2[mj] + d1[mj]*d2[mL];
775                 d1[mj] = (b + a)*(fREAL)0.5;
776                 d1[mL] = (b - a)*(fREAL)0.5;
777                 a = d1[m2 + mj]*d2[m2 + mj] - d1[m2 + mL]*d2[m2 + mL];
778                 b = d1[m2 + mL]*d2[m2 + mj] + d1[m2 + mj]*d2[m2 + mL];
779                 d1[m2 + mj] = (b + a)*(fREAL)0.5;
780                 d1[m2 + mL] = (b - a)*(fREAL)0.5;
781         }
782         for (i=1; i<m2; i++) {
783                 k = m - i;
784                 for (j=1; j<n2; j++) {
785                         L = n - j;
786                         mj = j << M;
787                         mL = L << M;
788                         a = d1[i + mj]*d2[i + mj] - d1[k + mL]*d2[k + mL];
789                         b = d1[k + mL]*d2[i + mj] + d1[i + mj]*d2[k + mL];
790                         d1[i + mj] = (b + a)*(fREAL)0.5;
791                         d1[k + mL] = (b - a)*(fREAL)0.5;
792                         a = d1[i + mL]*d2[i + mL] - d1[k + mj]*d2[k + mj];
793                         b = d1[k + mj]*d2[i + mL] + d1[i + mL]*d2[k + mj];
794                         d1[i + mL] = (b + a)*(fREAL)0.5;
795                         d1[k + mj] = (b - a)*(fREAL)0.5;
796                 }
797         }
798 }
799
800 //------------------------------------------------------------------------------
801
802 void convolve(CompBuf* dst, CompBuf* in1, CompBuf* in2)
803 {
804         fREAL *data1, *data2, *fp;
805         unsigned int w2, h2, hw, hh, log2_w, log2_h;
806         fRGB wt, *colp;
807         int x, y, ch;
808         int xbl, ybl, nxb, nyb, xbsz, ybsz;
809         int in2done = 0;
810
811         CompBuf* rdst = alloc_compbuf(in1->x, in1->y, in1->type, 1);
812
813         // convolution result width & height
814         w2 = 2*in2->x - 1;
815         h2 = 2*in2->y - 1;
816         // FFT pow2 required size & log2
817         w2 = nextPow2(w2, &log2_w);
818         h2 = nextPow2(h2, &log2_h);
819
820         // alloc space
821         data1 = (fREAL*)MEM_callocN(3*w2*h2*sizeof(fREAL), "convolve_fast FHT data1");
822         data2 = (fREAL*)MEM_callocN(w2*h2*sizeof(fREAL), "convolve_fast FHT data2");
823
824         // normalize convolutor
825         wt[0] = wt[1] = wt[2] = 0.f;
826         for (y=0; y<in2->y; y++) {
827                 colp = (fRGB*)&in2->rect[y*in2->x*in2->type];
828                 for (x=0; x<in2->x; x++)
829                         fRGB_add(wt, colp[x]);
830         }
831         if (wt[0] != 0.f) wt[0] = 1.f/wt[0];
832         if (wt[1] != 0.f) wt[1] = 1.f/wt[1];
833         if (wt[2] != 0.f) wt[2] = 1.f/wt[2];
834         for (y=0; y<in2->y; y++) {
835                 colp = (fRGB*)&in2->rect[y*in2->x*in2->type];
836                 for (x=0; x<in2->x; x++)
837                         fRGB_colormult(colp[x], wt);
838         }
839
840         // copy image data, unpacking interleaved RGBA into separate channels
841         // only need to calc data1 once
842
843         // block add-overlap
844         hw = in2->x >> 1;
845         hh = in2->y >> 1;
846         xbsz = (w2 + 1) - in2->x;
847         ybsz = (h2 + 1) - in2->y;
848         nxb = in1->x / xbsz;
849         if (in1->x % xbsz) nxb++;
850         nyb = in1->y / ybsz;
851         if (in1->y % ybsz) nyb++;
852         for (ybl=0; ybl<nyb; ybl++) {
853                 for (xbl=0; xbl<nxb; xbl++) {
854
855                         // each channel one by one
856                         for (ch=0; ch<3; ch++) {
857                                 fREAL* data1ch = &data1[ch*w2*h2];
858
859                                 // only need to calc fht data from in2 once, can re-use for every block
860                                 if (!in2done) {
861                                         // in2, channel ch -> data1
862                                         for (y=0; y<in2->y; y++) {
863                                                 fp = &data1ch[y*w2];
864                                                 colp = (fRGB*)&in2->rect[y*in2->x*in2->type];
865                                                 for (x=0; x<in2->x; x++)
866                                                         fp[x] = colp[x][ch];
867                                         }
868                                 }
869
870                                 // in1, channel ch -> data2
871                                 memset(data2, 0, w2*h2*sizeof(fREAL));
872                                 for (y=0; y<ybsz; y++) {
873                                         int yy = ybl*ybsz + y;
874                                         if (yy >= in1->y) continue;
875                                         fp = &data2[y*w2];
876                                         colp = (fRGB*)&in1->rect[yy*in1->x*in1->type];
877                                         for (x=0; x<xbsz; x++) {
878                                                 int xx = xbl*xbsz + x;
879                                                 if (xx >= in1->x) continue;
880                                                 fp[x] = colp[xx][ch];
881                                         }
882                                 }
883
884                                 // forward FHT
885                                 // zero pad data start is different for each == height+1
886                                 if (!in2done) FHT2D(data1ch, log2_w, log2_h, in2->y+1, 0);
887                                 FHT2D(data2, log2_w, log2_h, in2->y+1, 0);
888
889                                 // FHT2D transposed data, row/col now swapped
890                                 // convolve & inverse FHT
891                                 fht_convolve(data2, data1ch, log2_h, log2_w);
892                                 FHT2D(data2, log2_h, log2_w, 0, 1);
893                                 // data again transposed, so in order again
894
895                                 // overlap-add result
896                                 for (y=0; y<(int)h2; y++) {
897                                         const int yy = ybl*ybsz + y - hh;
898                                         if ((yy < 0) || (yy >= in1->y)) continue;
899                                         fp = &data2[y*w2];
900                                         colp = (fRGB*)&rdst->rect[yy*in1->x*in1->type];
901                                         for (x=0; x<(int)w2; x++) {
902                                                 const int xx = xbl*xbsz + x - hw;
903                                                 if ((xx < 0) || (xx >= in1->x)) continue;
904                                                 colp[xx][ch] += fp[x];
905                                         }
906                                 }
907
908                         }
909                         in2done = 1;
910                 }
911         }
912
913         MEM_freeN(data2);
914         MEM_freeN(data1);
915         memcpy(dst->rect, rdst->rect, sizeof(float)*dst->x*dst->y*dst->type);
916         free_compbuf(rdst);
917 }
918
919
920 /*
921  *
922  * Utility functions qd_* should probably be intergrated better with other functions here.
923  *
924  */
925 // sets fcol to pixelcolor at (x, y)
926 void qd_getPixel(CompBuf* src, int x, int y, float* col)
927 {
928         if ((x >= 0) && (x < src->x) && (y >= 0) && (y < src->y)) {
929                 float* bc = &src->rect[(x + y*src->x)*src->type];
930                 col[0] = bc[0], col[1] = bc[1], col[2] = bc[2];
931         }
932         else col[0] = col[1] = col[2] = 0.f;
933 }
934
935 // sets pixel (x, y) to color col
936 void qd_setPixel(CompBuf* src, int x, int y, float* col)
937 {
938         if ((x >= 0) && (x < src->x) && (y >= 0) && (y < src->y)) {
939                 float* bc = &src->rect[(x + y*src->x)*src->type];
940                 bc[0] = col[0], bc[1] = col[1], bc[2] = col[2];
941         }
942 }
943
944 // adds fcol to pixelcolor (x, y)
945 void qd_addPixel(CompBuf* src, int x, int y, float* col)
946 {
947         if ((x >= 0) && (x < src->x) && (y >= 0) && (y < src->y)) {
948                 float* bc = &src->rect[(x + y*src->x)*src->type];
949                 bc[0] += col[0], bc[1] += col[1], bc[2] += col[2];
950         }
951 }
952
953 // multiplies pixel by factor value f
954 void qd_multPixel(CompBuf* src, int x, int y, float f)
955 {
956         if ((x >= 0) && (x < src->x) && (y >= 0) && (y < src->y)) {
957                 float* bc = &src->rect[(x + y*src->x)*src->type];
958                 bc[0] *= f, bc[1] *= f, bc[2] *= f;
959         }
960 }
961
962 // bilinear interpolation with wraparound
963 void qd_getPixelLerpWrap(CompBuf* src, float u, float v, float* col)
964 {
965         const float ufl = floor(u), vfl = floor(v);
966         const int nx = (int)ufl % src->x, ny = (int)vfl % src->y;
967         const int x1 = (nx < 0) ? (nx + src->x) : nx;
968         const int y1 = (ny < 0) ? (ny + src->y) : ny;
969         const int x2 = (x1 + 1) % src->x, y2 = (y1 + 1) % src->y;
970         const float* c00 = &src->rect[(x1 + y1*src->x)*src->type];
971         const float* c10 = &src->rect[(x2 + y1*src->x)*src->type];
972         const float* c01 = &src->rect[(x1 + y2*src->x)*src->type];
973         const float* c11 = &src->rect[(x2 + y2*src->x)*src->type];
974         const float uf = u - ufl, vf = v - vfl;
975         const float w00=(1.f-uf)*(1.f-vf), w10=uf*(1.f-vf), w01=(1.f-uf)*vf, w11=uf*vf;
976         col[0] = w00*c00[0] + w10*c10[0] + w01*c01[0] + w11*c11[0];
977         if (src->type != CB_VAL) {
978                 col[1] = w00*c00[1] + w10*c10[1] + w01*c01[1] + w11*c11[1];
979                 col[2] = w00*c00[2] + w10*c10[2] + w01*c01[2] + w11*c11[2];
980                 col[3] = w00*c00[3] + w10*c10[3] + w01*c01[3] + w11*c11[3];
981         }
982 }
983
984 // as above, without wrap around
985 void qd_getPixelLerp(CompBuf* src, float u, float v, float* col)
986 {
987         const float ufl = floor(u), vfl = floor(v);
988         const int x1 = (int)ufl, y1 = (int)vfl;
989         const int x2 = (int)ceil(u), y2 = (int)ceil(v);
990         if ((x2 >= 0) && (y2 >= 0) && (x1 < src->x) && (y1 < src->y)) {
991                 const float B[4] = {0,0,0,0};
992                 const int ox1 = (x1 < 0), oy1 = (y1 < 0), ox2 = (x2 >= src->x), oy2 = (y2 >= src->y);
993                 const float* c00 = (ox1 || oy1) ? B : &src->rect[(x1 + y1*src->x)*src->type];
994                 const float* c10 = (ox2 || oy1) ? B : &src->rect[(x2 + y1*src->x)*src->type];
995                 const float* c01 = (ox1 || oy2) ? B : &src->rect[(x1 + y2*src->x)*src->type];
996                 const float* c11 = (ox2 || oy2) ? B : &src->rect[(x2 + y2*src->x)*src->type];
997                 const float uf = u - ufl, vf = v - vfl;
998                 const float w00=(1.f-uf)*(1.f-vf), w10=uf*(1.f-vf), w01=(1.f-uf)*vf, w11=uf*vf;
999                 col[0] = w00*c00[0] + w10*c10[0] + w01*c01[0] + w11*c11[0];
1000                 if (src->type != CB_VAL) {
1001                         col[1] = w00*c00[1] + w10*c10[1] + w01*c01[1] + w11*c11[1];
1002                         col[2] = w00*c00[2] + w10*c10[2] + w01*c01[2] + w11*c11[2];
1003                         col[3] = w00*c00[3] + w10*c10[3] + w01*c01[3] + w11*c11[3];
1004                 }
1005         }
1006         else col[0] = col[1] = col[2] = col[3] = 0.f;
1007 }
1008
1009 // as above, sampling only one channel
1010 void qd_getPixelLerpChan(CompBuf* src, float u, float v, int chan, float* out)
1011 {
1012         const float ufl = floor(u), vfl = floor(v);
1013         const int x1 = (int)ufl, y1 = (int)vfl;
1014         const int x2 = (int)ceil(u), y2 = (int)ceil(v);
1015         if (chan >= src->type) chan = 0;
1016         if ((x2 >= 0) && (y2 >= 0) && (x1 < src->x) && (y1 < src->y)) {
1017                 const float B[4] = {0,0,0,0};
1018                 const int ox1 = (x1 < 0), oy1 = (y1 < 0), ox2 = (x2 >= src->x), oy2 = (y2 >= src->y);
1019                 const float* c00 = (ox1 || oy1) ? B : &src->rect[(x1 + y1*src->x)*src->type + chan];
1020                 const float* c10 = (ox2 || oy1) ? B : &src->rect[(x2 + y1*src->x)*src->type + chan];
1021                 const float* c01 = (ox1 || oy2) ? B : &src->rect[(x1 + y2*src->x)*src->type + chan];
1022                 const float* c11 = (ox2 || oy2) ? B : &src->rect[(x2 + y2*src->x)*src->type + chan];
1023                 const float uf = u - ufl, vf = v - vfl;
1024                 const float w00=(1.f-uf)*(1.f-vf), w10=uf*(1.f-vf), w01=(1.f-uf)*vf, w11=uf*vf;
1025                 out[0] = w00*c00[0] + w10*c10[0] + w01*c01[0] + w11*c11[0];
1026         }
1027         else *out = 0.f;
1028 }
1029
1030
1031 CompBuf* qd_downScaledCopy(CompBuf* src, int scale)
1032 {
1033         CompBuf* fbuf;
1034         if (scale <= 1)
1035                 fbuf = dupalloc_compbuf(src);
1036         else {
1037                 int nw = src->x/scale, nh = src->y/scale;
1038                 if ((2*(src->x % scale)) > scale) nw++;
1039                 if ((2*(src->y % scale)) > scale) nh++;
1040                 fbuf = alloc_compbuf(nw, nh, src->type, 1);
1041                 {
1042                         int x, y, xx, yy, sx, sy, mx, my;
1043                         float colsum[4];
1044                         float fscale = 1.f/(float)(scale*scale);
1045                         for (y=0; y<nh; y++) {
1046                                 fRGB* fcolp = (fRGB*)&fbuf->rect[y*fbuf->x*fbuf->type];
1047                                 yy = y*scale;
1048                                 my = yy + scale;
1049                                 if (my > src->y) my = src->y;
1050                                 for (x=0; x<nw; x++) {
1051                                         xx = x*scale;
1052                                         mx = xx + scale;
1053                                         if (mx > src->x) mx = src->x;
1054                                         colsum[0] = colsum[1] = colsum[2] = 0.f;
1055                                         for (sy=yy; sy<my; sy++) {
1056                                                 fRGB* scolp = (fRGB*)&src->rect[sy*src->x*src->type];
1057                                                 for (sx=xx; sx<mx; sx++)
1058                                                         fRGB_add(colsum, scolp[sx]);
1059                                         }
1060                                         fRGB_mult(colsum, fscale);
1061                                         fRGB_copy(fcolp[x], colsum);
1062                                 }
1063                         }
1064                 }
1065         }
1066         return fbuf;
1067 }
1068
1069 // fast g.blur, per channel
1070 // xy var. bits 1 & 2 ca be used to blur in x or y direction separately
1071 void IIR_gauss(CompBuf* src, float sigma, int chan, int xy)
1072 {
1073         double q, q2, sc, cf[4], tsM[9], tsu[3], tsv[3];
1074         float *X, *Y, *W;
1075         int i, x, y, sz;
1076
1077         // <0.5 not valid, though can have a possibly useful sort of sharpening effect
1078         if (sigma < 0.5) return;
1079         
1080         if ((xy < 1) || (xy > 3)) xy = 3;
1081         
1082         // see "Recursive Gabor Filtering" by Young/VanVliet
1083         // all factors here in double.prec. Required, because for single.prec it seems to blow up if sigma > ~200
1084         if (sigma >= 3.556)
1085                 q = 0.9804*(sigma - 3.556) + 2.5091;
1086         else // sigma >= 0.5
1087                 q = (0.0561*sigma + 0.5784)*sigma - 0.2568;
1088         q2 = q*q;
1089         sc = (1.1668 + q)*(3.203729649  + (2.21566 + q)*q);
1090         // no gabor filtering here, so no complex multiplies, just the regular coefs.
1091         // all negated here, so as not to have to recalc Triggs/Sdika matrix
1092         cf[1] = q*(5.788961737 + (6.76492 + 3.0*q)*q)/ sc;
1093         cf[2] = -q2*(3.38246 + 3.0*q)/sc;
1094         // 0 & 3 unchanged
1095         cf[3] = q2*q/sc;
1096         cf[0] = 1.0 - cf[1] - cf[2] - cf[3];
1097
1098         // Triggs/Sdika border corrections,
1099         // it seems to work, not entirely sure if it is actually totally correct,
1100         // Besides J.M.Geusebroek's anigauss.c (see http://www.science.uva.nl/~mark),
1101         // found one other implementation by Cristoph Lampert,
1102         // but neither seem to be quite the same, result seems to be ok sofar anyway.
1103         // Extra scale factor here to not have to do it in filter,
1104         // though maybe this had something to with the precision errors
1105         sc = cf[0]/((1.0 + cf[1] - cf[2] + cf[3])*(1.0 - cf[1] - cf[2] - cf[3])*(1.0 + cf[2] + (cf[1] - cf[3])*cf[3]));
1106         tsM[0] = sc*(-cf[3]*cf[1] + 1.0 - cf[3]*cf[3] - cf[2]);
1107         tsM[1] = sc*((cf[3] + cf[1])*(cf[2] + cf[3]*cf[1]));
1108         tsM[2] = sc*(cf[3]*(cf[1] + cf[3]*cf[2]));
1109         tsM[3] = sc*(cf[1] + cf[3]*cf[2]);
1110         tsM[4] = sc*(-(cf[2] - 1.0)*(cf[2] + cf[3]*cf[1]));
1111         tsM[5] = sc*(-(cf[3]*cf[1] + cf[3]*cf[3] + cf[2] - 1.0)*cf[3]);
1112         tsM[6] = sc*(cf[3]*cf[1] + cf[2] + cf[1]*cf[1] - cf[2]*cf[2]);
1113         tsM[7] = sc*(cf[1]*cf[2] + cf[3]*cf[2]*cf[2] - cf[1]*cf[3]*cf[3] - cf[3]*cf[3]*cf[3] - cf[3]*cf[2] + cf[3]);
1114         tsM[8] = sc*(cf[3]*(cf[1] + cf[3]*cf[2]));
1115
1116 #define YVV(L)\
1117 {\
1118         W[0] = cf[0]*X[0] + cf[1]*X[0] + cf[2]*X[0] + cf[3]*X[0];\
1119         W[1] = cf[0]*X[1] + cf[1]*W[0] + cf[2]*X[0] + cf[3]*X[0];\
1120         W[2] = cf[0]*X[2] + cf[1]*W[1] + cf[2]*W[0] + cf[3]*X[0];\
1121         for (i=3; i<L; i++)\
1122                 W[i] = cf[0]*X[i] + cf[1]*W[i-1] + cf[2]*W[i-2] + cf[3]*W[i-3];\
1123         tsu[0] = W[L-1] - X[L-1];\
1124         tsu[1] = W[L-2] - X[L-1];\
1125         tsu[2] = W[L-3] - X[L-1];\
1126         tsv[0] = tsM[0]*tsu[0] + tsM[1]*tsu[1] + tsM[2]*tsu[2] + X[L-1];\
1127         tsv[1] = tsM[3]*tsu[0] + tsM[4]*tsu[1] + tsM[5]*tsu[2] + X[L-1];\
1128         tsv[2] = tsM[6]*tsu[0] + tsM[7]*tsu[1] + tsM[8]*tsu[2] + X[L-1];\
1129         Y[L-1] = cf[0]*W[L-1] + cf[1]*tsv[0] + cf[2]*tsv[1] + cf[3]*tsv[2];\
1130         Y[L-2] = cf[0]*W[L-2] + cf[1]*Y[L-1] + cf[2]*tsv[0] + cf[3]*tsv[1];\
1131         Y[L-3] = cf[0]*W[L-3] + cf[1]*Y[L-2] + cf[2]*Y[L-1] + cf[3]*tsv[0];\
1132         for (i=L-4; i>=0; i--)\
1133                 Y[i] = cf[0]*W[i] + cf[1]*Y[i+1] + cf[2]*Y[i+2] + cf[3]*Y[i+3];\
1134 }
1135
1136         // intermediate buffers
1137         sz = MAX2(src->x, src->y);
1138         X = MEM_callocN(sz*sizeof(float), "IIR_gauss X buf");
1139         Y = MEM_callocN(sz*sizeof(float), "IIR_gauss Y buf");
1140         W = MEM_callocN(sz*sizeof(float), "IIR_gauss W buf");
1141         if (xy & 1) {   // H
1142                 for (y=0; y<src->y; ++y) {
1143                         const int yx = y*src->x;
1144                         for (x=0; x<src->x; ++x)
1145                                 X[x] = src->rect[(x + yx)*src->type + chan];
1146                         YVV(src->x);
1147                         for (x=0; x<src->x; ++x)
1148                                 src->rect[(x + yx)*src->type + chan] = Y[x];
1149                 }
1150         }
1151         if (xy & 2) {   // V
1152                 for (x=0; x<src->x; ++x) {
1153                         for (y=0; y<src->y; ++y)
1154                                 X[y] = src->rect[(x + y*src->x)*src->type + chan];
1155                         YVV(src->y);
1156                         for (y=0; y<src->y; ++y)
1157                                 src->rect[(x + y*src->x)*src->type + chan] = Y[y];
1158                 }
1159         }
1160
1161         MEM_freeN(X);
1162         MEM_freeN(W);
1163         MEM_freeN(Y);
1164 #undef YVV
1165 }
1166