Cycles: Fix long compile time with MSVC.
[blender.git] / intern / cycles / kernel / kernel_compat_cpu.h
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifndef __KERNEL_COMPAT_CPU_H__
18 #define __KERNEL_COMPAT_CPU_H__
19
20 #define __KERNEL_CPU__  /* Set unconditionally by this header; presumably tested elsewhere to pick CPU code paths (verify). */
21
22 /* The release kernel triggers too many false-positive maybe-uninitialized
23  * warnings, which makes it easy to miss the real ones.
    * They are silenced only for GCC proper (clang is excluded) and only when
    * NDEBUG is defined. No matching "diagnostic pop" follows in this header,
    * so the setting stays in effect for whatever is compiled after it.
24  */
25 #if (defined(__GNUC__) && !defined(__clang__)) && defined(NDEBUG)
26 #  pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
27 #  pragma GCC diagnostic ignored "-Wuninitialized"
28 #endif
29
30 /* Selective nodes compilation.
    * Either macro may be defined before this header is included; when it is
    * not, it defaults to NODE_GROUP_LEVEL_MAX / NODE_FEATURE_ALL respectively.
    */
31 #ifndef __NODES_MAX_GROUP__
32 #  define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
33 #endif
34 #ifndef __NODES_FEATURES__
35 #  define __NODES_FEATURES__ NODE_FEATURE_ALL
36 #endif
37
38 #include "util_debug.h"
39 #include "util_math.h"
40 #include "util_simd.h"
41 #include "util_half.h"
42 #include "util_types.h"
43 #include "util_texture.h"
44
45 #define ccl_addr_space  /* Expands to nothing in this header. */
46
47 /* On x86_64, versions of glibc < 2.16 have an issue where expf is
48  * much slower than the double version.  This was fixed in glibc 2.16.
    * For the affected versions expf(x) is redirected to exp() evaluated in
    * double precision, with the result converted back to float.
49  */
50 #if !defined(__KERNEL_GPU__) && defined(__x86_64__) && \
51      defined(__GNU_LIBRARY__) && defined(__GLIBC__) && defined(__GLIBC_MINOR__) && \
52      (__GLIBC__ < 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ < 16))
53 #  define expf(x) ((float)exp((double)(x)))
54 #endif
55
56 CCL_NAMESPACE_BEGIN
57
58 /* Assertions inside the kernel only work for the CPU device, so we wrap it in
59  * a macro which is empty for other devices.
    * In this header kernel_assert(cond) expands directly to assert(cond). */
60
61 #define kernel_assert(cond) assert(cond)
62
63 /* Texture types to be compatible with CUDA textures. These are really just
64  * simple arrays and after inlining fetch hopefully revert to being a simple
65  * pointer lookup.
    *
    * The struct only stores a pointer and an element count; it declares no
    * constructor or destructor, so it never allocates or frees `data` itself. */
66
67 template<typename T> struct texture  {
           /* Return element `index` of `data`. The range check against `width`
            * exists only through kernel_assert(). */
68         ccl_always_inline T fetch(int index)
69         {
70                 kernel_assert(index >= 0 && index < width);
71                 return data[index];
72         }
73
74 #ifdef __KERNEL_SSE2__
           /* Same lookup with `data` reinterpreted as an array of ssef, so
            * `index` counts ssef-sized elements.
            * NOTE(review): the assertion still compares against `width`; that is
            * only a tight bound if `width` is expressed in the same units.
            * Confirm for the element types this is used with. */
75         ccl_always_inline ssef fetch_ssef(int index)
76         {
77                 kernel_assert(index >= 0 && index < width);
78                 return ((ssef*)data)[index];
79         }
80
           /* As fetch_ssef(), but `data` is reinterpreted as an array of ssei. */
81         ccl_always_inline ssei fetch_ssei(int index)
82         {
83                 kernel_assert(index >= 0 && index < width);
84                 return ((ssei*)data)[index];
85         }
86 #endif
87
           /* Element storage read by the fetch functions. */
88         T *data;
           /* Exclusive upper bound used by the fetch assertions. */
89         int width;
90 };
91
92 template<typename T> struct texture_image  {
93 #define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
94         { \
95                 u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \
96                 u[1] =  ((      0.5f * t - 1.0f) * t       ) * t + (2.0f/3.0f); \
97                 u[2] =  ((     -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \
98                 u[3] = (1.0f / 6.0f) * t * t * t; \
99         } (void)0
100
101         ccl_always_inline float4 read(float4 r)
102         {
103                 return r;
104         }
105
106         ccl_always_inline float4 read(uchar4 r)
107         {
108                 float f = 1.0f/255.0f;
109                 return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
110         }
111
112         ccl_always_inline float4 read(uchar r)
113         {
114                 float f = r*(1.0f/255.0f);
115                 return make_float4(f, f, f, 1.0);
116         }
117
118         ccl_always_inline float4 read(float r)
119         {
120                 /* TODO(dingto): Optimize this, so interpolation
121                  * happens on float instead of float4 */
122                 return make_float4(r, r, r, 1.0f);
123         }
124
125         ccl_always_inline int wrap_periodic(int x, int width)
126         {
127                 x %= width;
128                 if(x < 0)
129                         x += width;
130                 return x;
131         }
132
133         ccl_always_inline int wrap_clamp(int x, int width)
134         {
135                 return clamp(x, 0, width-1);
136         }
137
138         ccl_always_inline float frac(float x, int *ix)
139         {
140                 int i = float_to_int(x) - ((x < 0.0f)? 1: 0);
141                 *ix = i;
142                 return x - (float)i;
143         }
144
145         ccl_always_inline float4 interp(float x, float y)
146         {
147                 if(UNLIKELY(!data))
148                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
149
150                 int ix, iy, nix, niy;
151
152                 if(interpolation == INTERPOLATION_CLOSEST) {
153                         frac(x*(float)width, &ix);
154                         frac(y*(float)height, &iy);
155                         switch(extension) {
156                                 case EXTENSION_REPEAT:
157                                         ix = wrap_periodic(ix, width);
158                                         iy = wrap_periodic(iy, height);
159                                         break;
160                                 case EXTENSION_CLIP:
161                                         if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
162                                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
163                                         }
164                                         /* Fall through. */
165                                 case EXTENSION_EXTEND:
166                                         ix = wrap_clamp(ix, width);
167                                         iy = wrap_clamp(iy, height);
168                                         break;
169                                 default:
170                                         kernel_assert(0);
171                                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
172                         }
173                         return read(data[ix + iy*width]);
174                 }
175                 else if(interpolation == INTERPOLATION_LINEAR) {
176                         float tx = frac(x*(float)width - 0.5f, &ix);
177                         float ty = frac(y*(float)height - 0.5f, &iy);
178
179                         switch(extension) {
180                                 case EXTENSION_REPEAT:
181                                         ix = wrap_periodic(ix, width);
182                                         iy = wrap_periodic(iy, height);
183
184                                         nix = wrap_periodic(ix+1, width);
185                                         niy = wrap_periodic(iy+1, height);
186                                         break;
187                                 case EXTENSION_CLIP:
188                                         if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
189                                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
190                                         }
191                                         /* Fall through. */
192                                 case EXTENSION_EXTEND:
193                                         nix = wrap_clamp(ix+1, width);
194                                         niy = wrap_clamp(iy+1, height);
195
196                                         ix = wrap_clamp(ix, width);
197                                         iy = wrap_clamp(iy, height);
198                                         break;
199                                 default:
200                                         kernel_assert(0);
201                                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
202                         }
203
204                         float4 r = (1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width]);
205                         r += (1.0f - ty)*tx*read(data[nix + iy*width]);
206                         r += ty*(1.0f - tx)*read(data[ix + niy*width]);
207                         r += ty*tx*read(data[nix + niy*width]);
208
209                         return r;
210                 }
211                 else {
212                         /* Bicubic b-spline interpolation. */
213                         float tx = frac(x*(float)width - 0.5f, &ix);
214                         float ty = frac(y*(float)height - 0.5f, &iy);
215                         int pix, piy, nnix, nniy;
216                         switch(extension) {
217                                 case EXTENSION_REPEAT:
218                                         ix = wrap_periodic(ix, width);
219                                         iy = wrap_periodic(iy, height);
220
221                                         pix = wrap_periodic(ix-1, width);
222                                         piy = wrap_periodic(iy-1, height);
223
224                                         nix = wrap_periodic(ix+1, width);
225                                         niy = wrap_periodic(iy+1, height);
226
227                                         nnix = wrap_periodic(ix+2, width);
228                                         nniy = wrap_periodic(iy+2, height);
229                                         break;
230                                 case EXTENSION_CLIP:
231                                         if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
232                                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
233                                         }
234                                         /* Fall through. */
235                                 case EXTENSION_EXTEND:
236                                         pix = wrap_clamp(ix-1, width);
237                                         piy = wrap_clamp(iy-1, height);
238
239                                         nix = wrap_clamp(ix+1, width);
240                                         niy = wrap_clamp(iy+1, height);
241
242                                         nnix = wrap_clamp(ix+2, width);
243                                         nniy = wrap_clamp(iy+2, height);
244
245                                         ix = wrap_clamp(ix, width);
246                                         iy = wrap_clamp(iy, height);
247                                         break;
248                                 default:
249                                         kernel_assert(0);
250                                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
251                         }
252
253                         const int xc[4] = {pix, ix, nix, nnix};
254                         const int yc[4] = {width * piy,
255                                            width * iy,
256                                            width * niy,
257                                            width * nniy};
258                         float u[4], v[4];
259                         /* Some helper macro to keep code reasonable size,
260                          * let compiler to inline all the matrix multiplications.
261                          */
262 #define DATA(x, y) (read(data[xc[x] + yc[y]]))
263 #define TERM(col) \
264                         (v[col] * (u[0] * DATA(0, col) + \
265                                    u[1] * DATA(1, col) + \
266                                    u[2] * DATA(2, col) + \
267                                    u[3] * DATA(3, col)))
268
269                         SET_CUBIC_SPLINE_WEIGHTS(u, tx);
270                         SET_CUBIC_SPLINE_WEIGHTS(v, ty);
271
272                         /* Actual interpolation. */
273                         return TERM(0) + TERM(1) + TERM(2) + TERM(3);
274
275 #undef TERM
276 #undef DATA
277                 }
278         }
279
280         ccl_always_inline float4 interp_3d(float x, float y, float z)
281         {
282                 return interp_3d_ex(x, y, z, interpolation);
283         }
284
285         ccl_always_inline float4 interp_3d_ex(float x, float y, float z,
286                                               int interpolation = INTERPOLATION_LINEAR)
287         {
288                 if(UNLIKELY(!data))
289                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
290
291                 int ix, iy, iz, nix, niy, niz;
292
293                 if(interpolation == INTERPOLATION_CLOSEST) {
294                         frac(x*(float)width, &ix);
295                         frac(y*(float)height, &iy);
296                         frac(z*(float)depth, &iz);
297
298                         switch(extension) {
299                                 case EXTENSION_REPEAT:
300                                         ix = wrap_periodic(ix, width);
301                                         iy = wrap_periodic(iy, height);
302                                         iz = wrap_periodic(iz, depth);
303                                         break;
304                                 case EXTENSION_CLIP:
305                                         if(x < 0.0f || y < 0.0f || z < 0.0f ||
306                                            x > 1.0f || y > 1.0f || z > 1.0f)
307                                         {
308                                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
309                                         }
310                                         /* Fall through. */
311                                 case EXTENSION_EXTEND:
312                                         ix = wrap_clamp(ix, width);
313                                         iy = wrap_clamp(iy, height);
314                                         iz = wrap_clamp(iz, depth);
315                                         break;
316                                 default:
317                                         kernel_assert(0);
318                                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
319                         }
320
321                         return read(data[ix + iy*width + iz*width*height]);
322                 }
323                 else if(interpolation == INTERPOLATION_LINEAR) {
324                         float tx = frac(x*(float)width - 0.5f, &ix);
325                         float ty = frac(y*(float)height - 0.5f, &iy);
326                         float tz = frac(z*(float)depth - 0.5f, &iz);
327
328                         switch(extension) {
329                                 case EXTENSION_REPEAT:
330                                         ix = wrap_periodic(ix, width);
331                                         iy = wrap_periodic(iy, height);
332                                         iz = wrap_periodic(iz, depth);
333
334                                         nix = wrap_periodic(ix+1, width);
335                                         niy = wrap_periodic(iy+1, height);
336                                         niz = wrap_periodic(iz+1, depth);
337                                         break;
338                                 case EXTENSION_CLIP:
339                                         if(x < 0.0f || y < 0.0f || z < 0.0f ||
340                                            x > 1.0f || y > 1.0f || z > 1.0f)
341                                         {
342                                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
343                                         }
344                                         /* Fall through. */
345                                 case EXTENSION_EXTEND:
346                                         nix = wrap_clamp(ix+1, width);
347                                         niy = wrap_clamp(iy+1, height);
348                                         niz = wrap_clamp(iz+1, depth);
349
350                                         ix = wrap_clamp(ix, width);
351                                         iy = wrap_clamp(iy, height);
352                                         iz = wrap_clamp(iz, depth);
353                                         break;
354                                 default:
355                                         kernel_assert(0);
356                                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
357                         }
358
359                         float4 r;
360
361                         r  = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + iz*width*height]);
362                         r += (1.0f - tz)*(1.0f - ty)*tx*read(data[nix + iy*width + iz*width*height]);
363                         r += (1.0f - tz)*ty*(1.0f - tx)*read(data[ix + niy*width + iz*width*height]);
364                         r += (1.0f - tz)*ty*tx*read(data[nix + niy*width + iz*width*height]);
365
366                         r += tz*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + niz*width*height]);
367                         r += tz*(1.0f - ty)*tx*read(data[nix + iy*width + niz*width*height]);
368                         r += tz*ty*(1.0f - tx)*read(data[ix + niy*width + niz*width*height]);
369                         r += tz*ty*tx*read(data[nix + niy*width + niz*width*height]);
370
371                         return r;
372                 }
373                 else {
374                         /* Tricubic b-spline interpolation. */
375                         const float tx = frac(x*(float)width - 0.5f, &ix);
376                         const float ty = frac(y*(float)height - 0.5f, &iy);
377                         const float tz = frac(z*(float)depth - 0.5f, &iz);
378                         int pix, piy, piz, nnix, nniy, nniz;
379
380                         switch(extension) {
381                                 case EXTENSION_REPEAT:
382                                         ix = wrap_periodic(ix, width);
383                                         iy = wrap_periodic(iy, height);
384                                         iz = wrap_periodic(iz, depth);
385
386                                         pix = wrap_periodic(ix-1, width);
387                                         piy = wrap_periodic(iy-1, height);
388                                         piz = wrap_periodic(iz-1, depth);
389
390                                         nix = wrap_periodic(ix+1, width);
391                                         niy = wrap_periodic(iy+1, height);
392                                         niz = wrap_periodic(iz+1, depth);
393
394                                         nnix = wrap_periodic(ix+2, width);
395                                         nniy = wrap_periodic(iy+2, height);
396                                         nniz = wrap_periodic(iz+2, depth);
397                                         break;
398                                 case EXTENSION_CLIP:
399                                         if(x < 0.0f || y < 0.0f || z < 0.0f ||
400                                            x > 1.0f || y > 1.0f || z > 1.0f)
401                                         {
402                                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
403                                         }
404                                         /* Fall through. */
405                                 case EXTENSION_EXTEND:
406                                         pix = wrap_clamp(ix-1, width);
407                                         piy = wrap_clamp(iy-1, height);
408                                         piz = wrap_clamp(iz-1, depth);
409
410                                         nix = wrap_clamp(ix+1, width);
411                                         niy = wrap_clamp(iy+1, height);
412                                         niz = wrap_clamp(iz+1, depth);
413
414                                         nnix = wrap_clamp(ix+2, width);
415                                         nniy = wrap_clamp(iy+2, height);
416                                         nniz = wrap_clamp(iz+2, depth);
417
418                                         ix = wrap_clamp(ix, width);
419                                         iy = wrap_clamp(iy, height);
420                                         iz = wrap_clamp(iz, depth);
421                                         break;
422                                 default:
423                                         kernel_assert(0);
424                                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
425                         }
426
427                         const int xc[4] = {pix, ix, nix, nnix};
428                         const int yc[4] = {width * piy,
429                                            width * iy,
430                                            width * niy,
431                                            width * nniy};
432                         const int zc[4] = {width * height * piz,
433                                            width * height * iz,
434                                            width * height * niz,
435                                            width * height * nniz};
436                         float u[4], v[4], w[4];
437
438                         /* Some helper macro to keep code reasonable size,
439                          * let compiler to inline all the matrix multiplications.
440                          */
441 #define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]]))
442 #define COL_TERM(col, row) \
443                         (v[col] * (u[0] * DATA(0, col, row) + \
444                                    u[1] * DATA(1, col, row) + \
445                                    u[2] * DATA(2, col, row) + \
446                                    u[3] * DATA(3, col, row)))
447 #define ROW_TERM(row) \
448                         (w[row] * (COL_TERM(0, row) + \
449                                    COL_TERM(1, row) + \
450                                    COL_TERM(2, row) + \
451                                    COL_TERM(3, row)))
452
453                         SET_CUBIC_SPLINE_WEIGHTS(u, tx);
454                         SET_CUBIC_SPLINE_WEIGHTS(v, ty);
455                         SET_CUBIC_SPLINE_WEIGHTS(w, tz);
456
457                         /* Actual interpolation. */
458                         return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
459
460 #undef COL_TERM
461 #undef ROW_TERM
462 #undef DATA
463                 }
464         }
465
466         ccl_always_inline void dimensions_set(int width_, int height_, int depth_)
467         {
468                 width = width_;
469                 height = height_;
470                 depth = depth_;
471         }
472
473         T *data;
474         int interpolation;
475         ExtensionType extension;
476         int width, height, depth;
477 #undef SET_CUBIC_SPLINE_WEIGHTS
478 };
479
480 typedef texture<float4> texture_float4;  /* Aliases for texture<T>, one per element type. */
481 typedef texture<float2> texture_float2;
482 typedef texture<float> texture_float;
483 typedef texture<uint> texture_uint;
484 typedef texture<int> texture_int;
485 typedef texture<uint4> texture_uint4;
486 typedef texture<uchar4> texture_uchar4;
    /* Aliases for texture_image<T>; each T used here has a matching
     * texture_image::read() overload. */
487 typedef texture_image<float> texture_image_float;
488 typedef texture_image<uchar> texture_image_uchar;
489 typedef texture_image<float4> texture_image_float4;
490 typedef texture_image<uchar4> texture_image_uchar4;
491
492 /* Macros to handle different memory storage on different devices.
     *
     * Every macro below expands to an expression that uses a variable named
     * `kg`, which therefore has to be in scope wherever the macro is used. */
493
    /* For these four, `tex` names a member of *kg and the call is forwarded
     * to that member.
     * NOTE(review): struct texture in this header provides fetch_ssef() and
     * fetch_ssei() only under __KERNEL_SSE2__ and declares no lookup() member,
     * so kernel_tex_lookup can only compile if kg->tex is some other type.
     * Confirm whether that macro is still used. */
494 #define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
495 #define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index))
496 #define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index))
497 #define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size))
498
    /* Image lookups are forwarded, with `kg` prepended to the arguments, to
     * *_impl functions that are not declared in this header. */
499 #define kernel_tex_image_interp(tex,x,y) kernel_tex_image_interp_impl(kg,tex,x,y)
500 #define kernel_tex_image_interp_3d(tex, x, y, z) kernel_tex_image_interp_3d_impl(kg,tex,x,y,z)
501 #define kernel_tex_image_interp_3d_ex(tex, x, y, z, interpolation) kernel_tex_image_interp_3d_ex_impl(kg,tex, x, y, z, interpolation)
502
    /* Shorthand for the __data member of *kg. */
503 #define kernel_data (kg->__data)
504
505 #ifdef __KERNEL_SSE2__
506 typedef vector3<sseb> sse3b;  /* vector3 instantiated over the sseb, ssef and ssei types. */
507 typedef vector3<ssef> sse3f;
508 typedef vector3<ssei> sse3i;
509
510 ccl_device_inline void print_sse3b(const char *label, sse3b& a)
511 {
            /* Pass the x, y and z components to print_sseb() in that order,
             * each with the same label. */
512         print_sseb(label, a.x);
513         print_sseb(label, a.y);
514         print_sseb(label, a.z);
515 }
516
517 ccl_device_inline void print_sse3f(const char *label, sse3f& a)
518 {
            /* Pass the x, y and z components to print_ssef() in that order,
             * each with the same label. */
519         print_ssef(label, a.x);
520         print_ssef(label, a.y);
521         print_ssef(label, a.z);
522 }
523
524 ccl_device_inline void print_sse3i(const char *label, sse3i& a)
525 {
            /* Pass the x, y and z components to print_ssei() in that order,
             * each with the same label. */
526         print_ssei(label, a.x);
527         print_ssei(label, a.y);
528         print_ssei(label, a.z);
529 }
530
531 #endif
532
533 CCL_NAMESPACE_END
534
535 #endif /* __KERNEL_COMPAT_CPU_H__ */
536