speedup for fast gauss blue (approx 10% - 15%)
[blender.git] / source / blender / compositor / operations / COM_FastGaussianBlurOperation.cpp
1 /*
2  * Copyright 2011, Blender Foundation.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License
6  * as published by the Free Software Foundation; either version 2
7  * of the License, or (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write to the Free Software Foundation,
16  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17  *
18  * Contributor: 
19  *              Jeroen Bakker 
20  *              Monique Dewanchand
21  */
22
23 #include <limits.h>
24
25 #include "COM_FastGaussianBlurOperation.h"
26 #include "MEM_guardedalloc.h"
27 #include "BLI_utildefines.h"
28
29 FastGaussianBlurOperation::FastGaussianBlurOperation() : BlurBaseOperation()
30 {
31         this->iirgaus = NULL;
32 }
33
34 void FastGaussianBlurOperation::executePixel(float *color, int x, int y, MemoryBuffer *inputBuffers[], void *data)
35 {
36         MemoryBuffer *newData = (MemoryBuffer *)data;
37         newData->read(color, x, y);     
38 }
39
40 bool FastGaussianBlurOperation::determineDependingAreaOfInterest(rcti *input, ReadBufferOperation *readOperation, rcti *output)
41 {
42         rcti newInput;
43         rcti sizeInput;
44         sizeInput.xmin = 0;
45         sizeInput.ymin = 0;
46         sizeInput.xmax = 5;
47         sizeInput.ymax = 5;
48         
49         NodeOperation *operation = this->getInputOperation(1);
50         if (operation->determineDependingAreaOfInterest(&sizeInput, readOperation, output)) {
51                 return true;
52         }
53         else {
54                 if (this->iirgaus) {
55                         return false;
56                 }
57                 else {
58                         newInput.xmin = 0;
59                         newInput.ymin = 0;
60                         newInput.xmax = this->getWidth();
61                         newInput.ymax = this->getHeight();
62                 }
63                 return NodeOperation::determineDependingAreaOfInterest(&newInput, readOperation, output);
64         }
65 }
66
67 void FastGaussianBlurOperation::initExecution()
68 {
69         BlurBaseOperation::initExecution();
70         BlurBaseOperation::initMutex();
71 }
72
73 void FastGaussianBlurOperation::deinitExecution() 
74 {
75         if (this->iirgaus) {
76                 delete this->iirgaus;
77                 this->iirgaus = NULL;
78         }
79         BlurBaseOperation::deinitMutex();
80 }
81
82 void *FastGaussianBlurOperation::initializeTileData(rcti *rect, MemoryBuffer **memoryBuffers)
83 {
84         lockMutex();
85         if (!iirgaus) {
86                 MemoryBuffer *newBuf = (MemoryBuffer *)this->inputProgram->initializeTileData(rect, memoryBuffers);
87                 MemoryBuffer *copy = newBuf->duplicate();
88                 updateSize(memoryBuffers);
89
90                 int c;
91                 sx = data->sizex * this->size / 2.0f;
92                 sy = data->sizey * this->size / 2.0f;
93                 
94                 if ((sx == sy) && (sx > 0.f)) {
95                         for (c = 0; c < COM_NUMBER_OF_CHANNELS; ++c)
96                                 IIR_gauss(copy, sx, c, 3);
97                 }
98                 else {
99                         if (sx > 0.f) {
100                                 for (c = 0; c < COM_NUMBER_OF_CHANNELS; ++c)
101                                         IIR_gauss(copy, sx, c, 1);
102                         }
103                         if (sy > 0.f) {
104                                 for (c = 0; c < COM_NUMBER_OF_CHANNELS; ++c)
105                                         IIR_gauss(copy, sy, c, 2);
106                         }
107                 }
108                 this->iirgaus = copy;
109         }
110         unlockMutex();
111         return iirgaus;
112 }
113
114 void FastGaussianBlurOperation::IIR_gauss(MemoryBuffer *src, float sigma, unsigned int chan, unsigned int xy)
115 {
116         double q, q2, sc, cf[4], tsM[9], tsu[3], tsv[3];
117         double *X, *Y, *W;
118         const unsigned int src_width = src->getWidth();
119         const unsigned int src_height = src->getHeight();
120         unsigned int x, y, sz;
121         unsigned int i;
122         float *buffer = src->getBuffer();
123         
124         // <0.5 not valid, though can have a possibly useful sort of sharpening effect
125         if (sigma < 0.5f) return;
126         
127         if ((xy < 1) || (xy > 3)) xy = 3;
128         
129         // XXX The YVV macro defined below explicitly expects sources of at least 3x3 pixels,
130         //     so just skiping blur along faulty direction if src's def is below that limit!
131         if (src_width < 3) xy &= ~(int) 1;
132         if (src_height < 3) xy &= ~(int) 2;
133         if (xy < 1) return;
134         
135         // see "Recursive Gabor Filtering" by Young/VanVliet
136         // all factors here in double.prec. Required, because for single.prec it seems to blow up if sigma > ~200
137         if (sigma >= 3.556f)
138                 q = 0.9804f * (sigma - 3.556f) + 2.5091f;
139         else // sigma >= 0.5
140                 q = (0.0561f * sigma + 0.5784f) * sigma - 0.2568f;
141         q2 = q * q;
142         sc = (1.1668 + q) * (3.203729649  + (2.21566 + q) * q);
143         // no gabor filtering here, so no complex multiplies, just the regular coefs.
144         // all negated here, so as not to have to recalc Triggs/Sdika matrix
145         cf[1] = q * (5.788961737 + (6.76492 + 3.0 * q) * q) / sc;
146         cf[2] = -q2 * (3.38246 + 3.0 * q) / sc;
147         // 0 & 3 unchanged
148         cf[3] = q2 * q / sc;
149         cf[0] = 1.0 - cf[1] - cf[2] - cf[3];
150         
151         // Triggs/Sdika border corrections,
152         // it seems to work, not entirely sure if it is actually totally correct,
153         // Besides J.M.Geusebroek's anigauss.c (see http://www.science.uva.nl/~mark),
154         // found one other implementation by Cristoph Lampert,
155         // but neither seem to be quite the same, result seems to be ok so far anyway.
156         // Extra scale factor here to not have to do it in filter,
157         // though maybe this had something to with the precision errors
158         sc = cf[0] / ((1.0 + cf[1] - cf[2] + cf[3]) * (1.0 - cf[1] - cf[2] - cf[3]) * (1.0 + cf[2] + (cf[1] - cf[3]) * cf[3]));
159         tsM[0] = sc * (-cf[3] * cf[1] + 1.0 - cf[3] * cf[3] - cf[2]);
160         tsM[1] = sc * ((cf[3] + cf[1]) * (cf[2] + cf[3] * cf[1]));
161         tsM[2] = sc * (cf[3] * (cf[1] + cf[3] * cf[2]));
162         tsM[3] = sc * (cf[1] + cf[3] * cf[2]);
163         tsM[4] = sc * (-(cf[2] - 1.0) * (cf[2] + cf[3] * cf[1]));
164         tsM[5] = sc * (-(cf[3] * cf[1] + cf[3] * cf[3] + cf[2] - 1.0) * cf[3]);
165         tsM[6] = sc * (cf[3] * cf[1] + cf[2] + cf[1] * cf[1] - cf[2] * cf[2]);
166         tsM[7] = sc * (cf[1] * cf[2] + cf[3] * cf[2] * cf[2] - cf[1] * cf[3] * cf[3] - cf[3] * cf[3] * cf[3] - cf[3] * cf[2] + cf[3]);
167         tsM[8] = sc * (cf[3] * (cf[1] + cf[3] * cf[2]));
168         
169 #define YVV(L)                                                                          \
170 {                                                                                       \
171         W[0] = cf[0] * X[0] + cf[1] * X[0] + cf[2] * X[0] + cf[3] * X[0];                   \
172         W[1] = cf[0] * X[1] + cf[1] * W[0] + cf[2] * X[0] + cf[3] * X[0];                   \
173         W[2] = cf[0] * X[2] + cf[1] * W[1] + cf[2] * W[0] + cf[3] * X[0];                   \
174         for (i = 3; i < L; i++) {                                                           \
175                 W[i] = cf[0] * X[i] + cf[1] * W[i - 1] + cf[2] * W[i - 2] + cf[3] * W[i - 3];   \
176         }                                                                                   \
177         tsu[0] = W[L - 1] - X[L - 1];                                                       \
178         tsu[1] = W[L - 2] - X[L - 1];                                                       \
179         tsu[2] = W[L - 3] - X[L - 1];                                                       \
180         tsv[0] = tsM[0] * tsu[0] + tsM[1] * tsu[1] + tsM[2] * tsu[2] + X[L - 1];            \
181         tsv[1] = tsM[3] * tsu[0] + tsM[4] * tsu[1] + tsM[5] * tsu[2] + X[L - 1];            \
182         tsv[2] = tsM[6] * tsu[0] + tsM[7] * tsu[1] + tsM[8] * tsu[2] + X[L - 1];            \
183         Y[L - 1] = cf[0] * W[L - 1] + cf[1] * tsv[0] + cf[2] * tsv[1] + cf[3] * tsv[2];     \
184         Y[L - 2] = cf[0] * W[L - 2] + cf[1] * Y[L - 1] + cf[2] * tsv[0] + cf[3] * tsv[1];   \
185         Y[L - 3] = cf[0] * W[L - 3] + cf[1] * Y[L - 2] + cf[2] * Y[L - 1] + cf[3] * tsv[0]; \
186         /* 'i != UINT_MAX' is really 'i >= 0', but necessary for unsigned int wrapping */   \
187         for (i = L - 4; i != UINT_MAX; i--) {                                               \
188                 Y[i] = cf[0] * W[i] + cf[1] * Y[i + 1] + cf[2] * Y[i + 2] + cf[3] * Y[i + 3];   \
189         }                                                                                   \
190 } (void)0
191         
192         // intermediate buffers
193         sz = MAX2(src_width, src_height);
194         X = (double *)MEM_callocN(sz * sizeof(double), "IIR_gauss X buf");
195         Y = (double *)MEM_callocN(sz * sizeof(double), "IIR_gauss Y buf");
196         W = (double *)MEM_callocN(sz * sizeof(double), "IIR_gauss W buf");
197         if (xy & 1) {   // H
198                 for (y = 0; y < src_height; ++y) {
199                         const int yx = y * src_width;
200                         for (x = 0; x < src_width; ++x)
201                                 X[x] = buffer[(x + yx) * COM_NUMBER_OF_CHANNELS + chan];
202                         YVV(src_width);
203                         for (x = 0; x < src_width; ++x)
204                                 buffer[(x + yx) * COM_NUMBER_OF_CHANNELS + chan] = Y[x];
205                 }
206         }
207         if (xy & 2) {   // V
208                 for (x = 0; x < src_width; ++x) {
209                         for (y = 0; y < src_height; ++y)
210                                 X[y] = buffer[(x + y * src_width) * COM_NUMBER_OF_CHANNELS + chan];
211                         YVV(src_height);
212                         for (y = 0; y < src_height; ++y)
213                                 buffer[(x + y * src_width) * COM_NUMBER_OF_CHANNELS + chan] = Y[y];
214                 }
215         }
216         
217         MEM_freeN(X);
218         MEM_freeN(W);
219         MEM_freeN(Y);
220 #undef YVV
221         
222 }