Cycles: CUDA bicubic and tricubic texture interpolation support.
[blender.git] / intern / cycles / kernel / kernels / cpu / kernel_cpu_image.h
1 /*
2  * Copyright 2011-2016 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifndef __KERNEL_CPU_IMAGE_H__
18 #define __KERNEL_CPU_IMAGE_H__
19
20 CCL_NAMESPACE_BEGIN
21
22 template<typename T> struct TextureInterpolator  {
23 #define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
24         { \
25                 u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \
26                 u[1] =  ((      0.5f * t - 1.0f) * t       ) * t + (2.0f/3.0f); \
27                 u[2] =  ((     -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \
28                 u[3] = (1.0f / 6.0f) * t * t * t; \
29         } (void)0
30
31         static ccl_always_inline float4 read(float4 r)
32         {
33                 return r;
34         }
35
36         static ccl_always_inline float4 read(uchar4 r)
37         {
38                 float f = 1.0f/255.0f;
39                 return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
40         }
41
42         static ccl_always_inline float4 read(uchar r)
43         {
44                 float f = r*(1.0f/255.0f);
45                 return make_float4(f, f, f, 1.0f);
46         }
47
48         static ccl_always_inline float4 read(float r)
49         {
50                 /* TODO(dingto): Optimize this, so interpolation
51                  * happens on float instead of float4 */
52                 return make_float4(r, r, r, 1.0f);
53         }
54
55         static ccl_always_inline float4 read(half4 r)
56         {
57                 return half4_to_float4(r);
58         }
59
60         static ccl_always_inline float4 read(half r)
61         {
62                 float f = half_to_float(r);
63                 return make_float4(f, f, f, 1.0f);
64         }
65
66         static ccl_always_inline int wrap_periodic(int x, int width)
67         {
68                 x %= width;
69                 if(x < 0)
70                         x += width;
71                 return x;
72         }
73
74         static ccl_always_inline int wrap_clamp(int x, int width)
75         {
76                 return clamp(x, 0, width-1);
77         }
78
79         static ccl_always_inline float frac(float x, int *ix)
80         {
81                 int i = float_to_int(x) - ((x < 0.0f)? 1: 0);
82                 *ix = i;
83                 return x - (float)i;
84         }
85
86         static ccl_always_inline float4 interp(const TextureInfo& info, float x, float y)
87         {
88                 if(UNLIKELY(!info.data))
89                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
90
91                 const T *data = (const T*)info.data;
92                 int width = info.width;
93                 int height = info.height;
94                 int ix, iy, nix, niy;
95
96                 if(info.interpolation == INTERPOLATION_CLOSEST) {
97                         frac(x*(float)width, &ix);
98                         frac(y*(float)height, &iy);
99                         switch(info.extension) {
100                                 case EXTENSION_REPEAT:
101                                         ix = wrap_periodic(ix, width);
102                                         iy = wrap_periodic(iy, height);
103                                         break;
104                                 case EXTENSION_CLIP:
105                                         if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
106                                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
107                                         }
108                                         ATTR_FALLTHROUGH;
109                                 case EXTENSION_EXTEND:
110                                         ix = wrap_clamp(ix, width);
111                                         iy = wrap_clamp(iy, height);
112                                         break;
113                                 default:
114                                         kernel_assert(0);
115                                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
116                         }
117                         return read(data[ix + iy*width]);
118                 }
119                 else if(info.interpolation == INTERPOLATION_LINEAR) {
120                         float tx = frac(x*(float)width - 0.5f, &ix);
121                         float ty = frac(y*(float)height - 0.5f, &iy);
122
123                         switch(info.extension) {
124                                 case EXTENSION_REPEAT:
125                                         ix = wrap_periodic(ix, width);
126                                         iy = wrap_periodic(iy, height);
127
128                                         nix = wrap_periodic(ix+1, width);
129                                         niy = wrap_periodic(iy+1, height);
130                                         break;
131                                 case EXTENSION_CLIP:
132                                         if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
133                                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
134                                         }
135                                         ATTR_FALLTHROUGH;
136                                 case EXTENSION_EXTEND:
137                                         nix = wrap_clamp(ix+1, width);
138                                         niy = wrap_clamp(iy+1, height);
139
140                                         ix = wrap_clamp(ix, width);
141                                         iy = wrap_clamp(iy, height);
142                                         break;
143                                 default:
144                                         kernel_assert(0);
145                                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
146                         }
147
148                         float4 r = (1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width]);
149                         r += (1.0f - ty)*tx*read(data[nix + iy*width]);
150                         r += ty*(1.0f - tx)*read(data[ix + niy*width]);
151                         r += ty*tx*read(data[nix + niy*width]);
152
153                         return r;
154                 }
155                 else {
156                         /* Bicubic b-spline interpolation. */
157                         float tx = frac(x*(float)width - 0.5f, &ix);
158                         float ty = frac(y*(float)height - 0.5f, &iy);
159                         int pix, piy, nnix, nniy;
160                         switch(info.extension) {
161                                 case EXTENSION_REPEAT:
162                                         ix = wrap_periodic(ix, width);
163                                         iy = wrap_periodic(iy, height);
164
165                                         pix = wrap_periodic(ix-1, width);
166                                         piy = wrap_periodic(iy-1, height);
167
168                                         nix = wrap_periodic(ix+1, width);
169                                         niy = wrap_periodic(iy+1, height);
170
171                                         nnix = wrap_periodic(ix+2, width);
172                                         nniy = wrap_periodic(iy+2, height);
173                                         break;
174                                 case EXTENSION_CLIP:
175                                         if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
176                                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
177                                         }
178                                         ATTR_FALLTHROUGH;
179                                 case EXTENSION_EXTEND:
180                                         pix = wrap_clamp(ix-1, width);
181                                         piy = wrap_clamp(iy-1, height);
182
183                                         nix = wrap_clamp(ix+1, width);
184                                         niy = wrap_clamp(iy+1, height);
185
186                                         nnix = wrap_clamp(ix+2, width);
187                                         nniy = wrap_clamp(iy+2, height);
188
189                                         ix = wrap_clamp(ix, width);
190                                         iy = wrap_clamp(iy, height);
191                                         break;
192                                 default:
193                                         kernel_assert(0);
194                                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
195                         }
196
197                         const int xc[4] = {pix, ix, nix, nnix};
198                         const int yc[4] = {width * piy,
199                                            width * iy,
200                                            width * niy,
201                                            width * nniy};
202                         float u[4], v[4];
203                         /* Some helper macro to keep code reasonable size,
204                          * let compiler to inline all the matrix multiplications.
205                          */
206 #define DATA(x, y) (read(data[xc[x] + yc[y]]))
207 #define TERM(col) \
208                         (v[col] * (u[0] * DATA(0, col) + \
209                                    u[1] * DATA(1, col) + \
210                                    u[2] * DATA(2, col) + \
211                                    u[3] * DATA(3, col)))
212
213                         SET_CUBIC_SPLINE_WEIGHTS(u, tx);
214                         SET_CUBIC_SPLINE_WEIGHTS(v, ty);
215
216                         /* Actual interpolation. */
217                         return TERM(0) + TERM(1) + TERM(2) + TERM(3);
218
219 #undef TERM
220 #undef DATA
221                 }
222         }
223
224         static ccl_always_inline float4 interp_3d_closest(const TextureInfo& info, float x, float y, float z)
225         {
226                 int width = info.width;
227                 int height = info.height;
228                 int depth = info.depth;
229                 int ix, iy, iz;
230
231                 frac(x*(float)width, &ix);
232                 frac(y*(float)height, &iy);
233                 frac(z*(float)depth, &iz);
234
235                 switch(info.extension) {
236                         case EXTENSION_REPEAT:
237                                 ix = wrap_periodic(ix, width);
238                                 iy = wrap_periodic(iy, height);
239                                 iz = wrap_periodic(iz, depth);
240                                 break;
241                         case EXTENSION_CLIP:
242                                 if(x < 0.0f || y < 0.0f || z < 0.0f ||
243                                    x > 1.0f || y > 1.0f || z > 1.0f)
244                                 {
245                                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
246                                 }
247                                 ATTR_FALLTHROUGH;
248                         case EXTENSION_EXTEND:
249                                 ix = wrap_clamp(ix, width);
250                                 iy = wrap_clamp(iy, height);
251                                 iz = wrap_clamp(iz, depth);
252                                 break;
253                         default:
254                                 kernel_assert(0);
255                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
256                 }
257
258                 const T *data = (const T*)info.data;
259                 return read(data[ix + iy*width + iz*width*height]);
260         }
261
262         static ccl_always_inline float4 interp_3d_linear(const TextureInfo& info, float x, float y, float z)
263         {
264                 int width = info.width;
265                 int height = info.height;
266                 int depth = info.depth;
267                 int ix, iy, iz;
268                 int nix, niy, niz;
269
270                 float tx = frac(x*(float)width - 0.5f, &ix);
271                 float ty = frac(y*(float)height - 0.5f, &iy);
272                 float tz = frac(z*(float)depth - 0.5f, &iz);
273
274                 switch(info.extension) {
275                         case EXTENSION_REPEAT:
276                                 ix = wrap_periodic(ix, width);
277                                 iy = wrap_periodic(iy, height);
278                                 iz = wrap_periodic(iz, depth);
279
280                                 nix = wrap_periodic(ix+1, width);
281                                 niy = wrap_periodic(iy+1, height);
282                                 niz = wrap_periodic(iz+1, depth);
283                                 break;
284                         case EXTENSION_CLIP:
285                                 if(x < 0.0f || y < 0.0f || z < 0.0f ||
286                                    x > 1.0f || y > 1.0f || z > 1.0f)
287                                 {
288                                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
289                                 }
290                                 ATTR_FALLTHROUGH;
291                         case EXTENSION_EXTEND:
292                                 nix = wrap_clamp(ix+1, width);
293                                 niy = wrap_clamp(iy+1, height);
294                                 niz = wrap_clamp(iz+1, depth);
295
296                                 ix = wrap_clamp(ix, width);
297                                 iy = wrap_clamp(iy, height);
298                                 iz = wrap_clamp(iz, depth);
299                                 break;
300                         default:
301                                 kernel_assert(0);
302                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
303                 }
304
305                 const T *data = (const T*)info.data;
306                 float4 r;
307
308                 r  = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + iz*width*height]);
309                 r += (1.0f - tz)*(1.0f - ty)*tx*read(data[nix + iy*width + iz*width*height]);
310                 r += (1.0f - tz)*ty*(1.0f - tx)*read(data[ix + niy*width + iz*width*height]);
311                 r += (1.0f - tz)*ty*tx*read(data[nix + niy*width + iz*width*height]);
312
313                 r += tz*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + niz*width*height]);
314                 r += tz*(1.0f - ty)*tx*read(data[nix + iy*width + niz*width*height]);
315                 r += tz*ty*(1.0f - tx)*read(data[ix + niy*width + niz*width*height]);
316                 r += tz*ty*tx*read(data[nix + niy*width + niz*width*height]);
317
318                 return r;
319         }
320
321         /* TODO(sergey): For some unspeakable reason both GCC-6 and Clang-3.9 are
322          * causing stack overflow issue in this function unless it is inlined.
323          *
324          * Only happens for AVX2 kernel and global __KERNEL_SSE__ vectorization
325          * enabled.
326          */
327 #ifdef __GNUC__
328         static ccl_always_inline
329 #else
330         static ccl_never_inline
331 #endif
332         float4 interp_3d_tricubic(const TextureInfo& info, float x, float y, float z)
333         {
334                 int width = info.width;
335                 int height = info.height;
336                 int depth = info.depth;
337                 int ix, iy, iz;
338                 int nix, niy, niz;
339                 /* Tricubic b-spline interpolation. */
340                 const float tx = frac(x*(float)width - 0.5f, &ix);
341                 const float ty = frac(y*(float)height - 0.5f, &iy);
342                 const float tz = frac(z*(float)depth - 0.5f, &iz);
343                 int pix, piy, piz, nnix, nniy, nniz;
344
345                 switch(info.extension) {
346                         case EXTENSION_REPEAT:
347                                 ix = wrap_periodic(ix, width);
348                                 iy = wrap_periodic(iy, height);
349                                 iz = wrap_periodic(iz, depth);
350
351                                 pix = wrap_periodic(ix-1, width);
352                                 piy = wrap_periodic(iy-1, height);
353                                 piz = wrap_periodic(iz-1, depth);
354
355                                 nix = wrap_periodic(ix+1, width);
356                                 niy = wrap_periodic(iy+1, height);
357                                 niz = wrap_periodic(iz+1, depth);
358
359                                 nnix = wrap_periodic(ix+2, width);
360                                 nniy = wrap_periodic(iy+2, height);
361                                 nniz = wrap_periodic(iz+2, depth);
362                                 break;
363                         case EXTENSION_CLIP:
364                                 if(x < 0.0f || y < 0.0f || z < 0.0f ||
365                                    x > 1.0f || y > 1.0f || z > 1.0f)
366                                 {
367                                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
368                                 }
369                                 ATTR_FALLTHROUGH;
370                         case EXTENSION_EXTEND:
371                                 pix = wrap_clamp(ix-1, width);
372                                 piy = wrap_clamp(iy-1, height);
373                                 piz = wrap_clamp(iz-1, depth);
374
375                                 nix = wrap_clamp(ix+1, width);
376                                 niy = wrap_clamp(iy+1, height);
377                                 niz = wrap_clamp(iz+1, depth);
378
379                                 nnix = wrap_clamp(ix+2, width);
380                                 nniy = wrap_clamp(iy+2, height);
381                                 nniz = wrap_clamp(iz+2, depth);
382
383                                 ix = wrap_clamp(ix, width);
384                                 iy = wrap_clamp(iy, height);
385                                 iz = wrap_clamp(iz, depth);
386                                 break;
387                         default:
388                                 kernel_assert(0);
389                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
390                 }
391
392                 const int xc[4] = {pix, ix, nix, nnix};
393                 const int yc[4] = {width * piy,
394                                    width * iy,
395                                    width * niy,
396                                    width * nniy};
397                 const int zc[4] = {width * height * piz,
398                                    width * height * iz,
399                                    width * height * niz,
400                                    width * height * nniz};
401                 float u[4], v[4], w[4];
402
403                 /* Some helper macro to keep code reasonable size,
404                  * let compiler to inline all the matrix multiplications.
405                  */
406 #define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]]))
407 #define COL_TERM(col, row) \
408                 (v[col] * (u[0] * DATA(0, col, row) + \
409                            u[1] * DATA(1, col, row) + \
410                            u[2] * DATA(2, col, row) + \
411                            u[3] * DATA(3, col, row)))
412 #define ROW_TERM(row) \
413                 (w[row] * (COL_TERM(0, row) + \
414                            COL_TERM(1, row) + \
415                            COL_TERM(2, row) + \
416                            COL_TERM(3, row)))
417
418                 SET_CUBIC_SPLINE_WEIGHTS(u, tx);
419                 SET_CUBIC_SPLINE_WEIGHTS(v, ty);
420                 SET_CUBIC_SPLINE_WEIGHTS(w, tz);
421
422                 /* Actual interpolation. */
423                 const T *data = (const T*)info.data;
424                 return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
425
426 #undef COL_TERM
427 #undef ROW_TERM
428 #undef DATA
429         }
430
431         static ccl_always_inline float4 interp_3d(const TextureInfo& info,
432                                                   float x, float y, float z,
433                                                   InterpolationType interp)
434         {
435                 if(UNLIKELY(!info.data))
436                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
437
438                 switch((interp == INTERPOLATION_NONE)? info.interpolation: interp) {
439                         case INTERPOLATION_CLOSEST:
440                                 return interp_3d_closest(info, x, y, z);
441                         case INTERPOLATION_LINEAR:
442                                 return interp_3d_linear(info, x, y, z);
443                         default:
444                                 return interp_3d_tricubic(info, x, y, z);
445                 }
446         }
447 #undef SET_CUBIC_SPLINE_WEIGHTS
448 };
449
450 ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
451 {
452         const TextureInfo& info = kernel_tex_fetch(__texture_info, id);
453
454         switch(kernel_tex_type(id)) {
455                 case IMAGE_DATA_TYPE_HALF:
456                         return TextureInterpolator<half>::interp(info, x, y);
457                 case IMAGE_DATA_TYPE_BYTE:
458                         return TextureInterpolator<uchar>::interp(info, x, y);
459                 case IMAGE_DATA_TYPE_FLOAT:
460                         return TextureInterpolator<float>::interp(info, x, y);
461                 case IMAGE_DATA_TYPE_HALF4:
462                         return TextureInterpolator<half4>::interp(info, x, y);
463                 case IMAGE_DATA_TYPE_BYTE4:
464                         return TextureInterpolator<uchar4>::interp(info, x, y);
465                 case IMAGE_DATA_TYPE_FLOAT4:
466                 default:
467                         return TextureInterpolator<float4>::interp(info, x, y);
468         }
469 }
470
471 ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z, InterpolationType interp)
472 {
473         const TextureInfo& info = kernel_tex_fetch(__texture_info, id);
474
475         switch(kernel_tex_type(id)) {
476                 case IMAGE_DATA_TYPE_HALF:
477                         return TextureInterpolator<half>::interp_3d(info, x, y, z, interp);
478                 case IMAGE_DATA_TYPE_BYTE:
479                         return TextureInterpolator<uchar>::interp_3d(info, x, y, z, interp);
480                 case IMAGE_DATA_TYPE_FLOAT:
481                         return TextureInterpolator<float>::interp_3d(info, x, y, z, interp);
482                 case IMAGE_DATA_TYPE_HALF4:
483                         return TextureInterpolator<half4>::interp_3d(info, x, y, z, interp);
484                 case IMAGE_DATA_TYPE_BYTE4:
485                         return TextureInterpolator<uchar4>::interp_3d(info, x, y, z, interp);
486                 case IMAGE_DATA_TYPE_FLOAT4:
487                 default:
488                         return TextureInterpolator<float4>::interp_3d(info, x, y, z, interp);
489         }
490 }
491
492 CCL_NAMESPACE_END
493
494 #endif // __KERNEL_CPU_IMAGE_H__