b96a84499b45477aa339d9b53e2d1981ee9e5e1b
[blender-staging.git] / intern / cycles / kernel / kernel_compat_cpu.h
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifndef __KERNEL_COMPAT_CPU_H__
18 #define __KERNEL_COMPAT_CPU_H__
19
20 #define __KERNEL_CPU__
21
/* Release builds of the kernel produce too many false-positive
 * maybe-uninitialized warnings, which makes it easy to miss genuine ones.
 */
25 #if (defined(__GNUC__) && !defined(__clang__)) && defined(NDEBUG)
26 #  pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
27 #  pragma GCC diagnostic ignored "-Wuninitialized"
28 #endif
29
30 /* Selective nodes compilation. */
31 #ifndef __NODES_MAX_GROUP__
32 #  define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
33 #endif
34 #ifndef __NODES_FEATURES__
35 #  define __NODES_FEATURES__ NODE_FEATURE_ALL
36 #endif
37
38 #include "util_debug.h"
39 #include "util_math.h"
40 #include "util_simd.h"
41 #include "util_half.h"
42 #include "util_types.h"
43
44 #define ccl_addr_space
45
/* On x86_64, versions of glibc older than 2.16 have an issue where expf() is
 * much slower than the double precision exp(). This was fixed in glibc 2.16,
 * so on older versions expf() is routed through exp() instead.
 *
 * The version test compares major and minor separately, so that only
 * releases strictly older than 2.16 are matched.
 */
#if !defined(__KERNEL_GPU__) && defined(__x86_64__) && \
    defined(__GNU_LIBRARY__) && defined(__GLIBC__) && defined(__GLIBC_MINOR__) && \
    ((__GLIBC__ < 2) || (__GLIBC__ == 2 && __GLIBC_MINOR__ < 16))
#  define expf(x) ((float)exp((double)(x)))
#endif
54
55 CCL_NAMESPACE_BEGIN
56
57 /* Assertions inside the kernel only work for the CPU device, so we wrap it in
58  * a macro which is empty for other devices */
59
60 #define kernel_assert(cond) assert(cond)
61
62 /* Texture types to be compatible with CUDA textures. These are really just
63  * simple arrays and after inlining fetch hopefully revert to being a simple
64  * pointer lookup. */
65
66 template<typename T> struct texture  {
67         ccl_always_inline T fetch(int index)
68         {
69                 kernel_assert(index >= 0 && index < width);
70                 return data[index];
71         }
72
73 #ifdef __KERNEL_SSE2__
74         ccl_always_inline ssef fetch_ssef(int index)
75         {
76                 kernel_assert(index >= 0 && index < width);
77                 return ((ssef*)data)[index];
78         }
79
80         ccl_always_inline ssei fetch_ssei(int index)
81         {
82                 kernel_assert(index >= 0 && index < width);
83                 return ((ssei*)data)[index];
84         }
85 #endif
86
87         T *data;
88         int width;
89 };
90
91 template<typename T> struct texture_image  {
92 #define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
93         { \
94                 u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \
95                 u[1] =  ((      0.5f * t - 1.0f) * t       ) * t + (2.0f/3.0f); \
96                 u[2] =  ((     -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \
97                 u[3] = (1.0f / 6.0f) * t * t * t; \
98         } (void)0
99
100         ccl_always_inline float4 read(float4 r)
101         {
102                 return r;
103         }
104
105         ccl_always_inline float4 read(uchar4 r)
106         {
107                 float f = 1.0f/255.0f;
108                 return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
109         }
110
111         ccl_always_inline int wrap_periodic(int x, int width)
112         {
113                 x %= width;
114                 if(x < 0)
115                         x += width;
116                 return x;
117         }
118
119         ccl_always_inline int wrap_clamp(int x, int width)
120         {
121                 return clamp(x, 0, width-1);
122         }
123
124         ccl_always_inline float frac(float x, int *ix)
125         {
126                 int i = float_to_int(x) - ((x < 0.0f)? 1: 0);
127                 *ix = i;
128                 return x - (float)i;
129         }
130
131         ccl_always_inline float4 interp(float x, float y)
132         {
133                 if(UNLIKELY(!data))
134                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
135
136                 int ix, iy, nix, niy;
137
138                 if(interpolation == INTERPOLATION_CLOSEST) {
139                         frac(x*(float)width, &ix);
140                         frac(y*(float)height, &iy);
141                         switch(extension) {
142                                 case EXTENSION_REPEAT:
143                                         ix = wrap_periodic(ix, width);
144                                         iy = wrap_periodic(iy, height);
145                                         break;
146                                 case EXTENSION_CLIP:
147                                         if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
148                                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
149                                         }
150                                         /* Fall through. */
151                                 case EXTENSION_EXTEND:
152                                         ix = wrap_clamp(ix, width);
153                                         iy = wrap_clamp(iy, height);
154                                         break;
155                                 default:
156                                         kernel_assert(0);
157                                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
158                         }
159                         return read(data[ix + iy*width]);
160                 }
161                 else if(interpolation == INTERPOLATION_LINEAR) {
162                         float tx = frac(x*(float)width - 0.5f, &ix);
163                         float ty = frac(y*(float)height - 0.5f, &iy);
164
165                         switch(extension) {
166                                 case EXTENSION_REPEAT:
167                                         ix = wrap_periodic(ix, width);
168                                         iy = wrap_periodic(iy, height);
169
170                                         nix = wrap_periodic(ix+1, width);
171                                         niy = wrap_periodic(iy+1, height);
172                                         break;
173                                 case EXTENSION_CLIP:
174                                         if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
175                                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
176                                         }
177                                         /* Fall through. */
178                                 case EXTENSION_EXTEND:
179                                         nix = wrap_clamp(ix+1, width);
180                                         niy = wrap_clamp(iy+1, height);
181
182                                         ix = wrap_clamp(ix, width);
183                                         iy = wrap_clamp(iy, height);
184                                         break;
185                                 default:
186                                         kernel_assert(0);
187                                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
188                         }
189
190                         float4 r = (1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width]);
191                         r += (1.0f - ty)*tx*read(data[nix + iy*width]);
192                         r += ty*(1.0f - tx)*read(data[ix + niy*width]);
193                         r += ty*tx*read(data[nix + niy*width]);
194
195                         return r;
196                 }
197                 else {
198                         /* Bicubic b-spline interpolation. */
199                         float tx = frac(x*(float)width - 0.5f, &ix);
200                         float ty = frac(y*(float)height - 0.5f, &iy);
201                         int pix, piy, nnix, nniy;
202                         switch(extension) {
203                                 case EXTENSION_REPEAT:
204                                         ix = wrap_periodic(ix, width);
205                                         iy = wrap_periodic(iy, height);
206
207                                         pix = wrap_periodic(ix-1, width);
208                                         piy = wrap_periodic(iy-1, height);
209
210                                         nix = wrap_periodic(ix+1, width);
211                                         niy = wrap_periodic(iy+1, height);
212
213                                         nnix = wrap_periodic(ix+2, width);
214                                         nniy = wrap_periodic(iy+2, height);
215                                         break;
216                                 case EXTENSION_CLIP:
217                                         if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
218                                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
219                                         }
220                                         /* Fall through. */
221                                 case EXTENSION_EXTEND:
222                                         pix = wrap_clamp(ix-1, width);
223                                         piy = wrap_clamp(iy-1, height);
224
225                                         nix = wrap_clamp(ix+1, width);
226                                         niy = wrap_clamp(iy+1, height);
227
228                                         nnix = wrap_clamp(ix+2, width);
229                                         nniy = wrap_clamp(iy+2, height);
230
231                                         ix = wrap_clamp(ix, width);
232                                         iy = wrap_clamp(iy, height);
233                                         break;
234                                 default:
235                                         kernel_assert(0);
236                                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
237                         }
238
239                         const int xc[4] = {pix, ix, nix, nnix};
240                         const int yc[4] = {width * piy,
241                                            width * iy,
242                                            width * niy,
243                                            width * nniy};
244                         float u[4], v[4];
245                         /* Some helper macro to keep code reasonable size,
246                          * let compiler to inline all the matrix multiplications.
247                          */
248 #define DATA(x, y) (read(data[xc[x] + yc[y]]))
249 #define TERM(col) \
250                         (v[col] * (u[0] * DATA(0, col) + \
251                                    u[1] * DATA(1, col) + \
252                                    u[2] * DATA(2, col) + \
253                                    u[3] * DATA(3, col)))
254
255                         SET_CUBIC_SPLINE_WEIGHTS(u, tx);
256                         SET_CUBIC_SPLINE_WEIGHTS(v, ty);
257
258                         /* Actual interpolation. */
259                         return TERM(0) + TERM(1) + TERM(2) + TERM(3);
260
261 #undef TERM
262 #undef DATA
263                 }
264         }
265
266         ccl_always_inline float4 interp_3d(float x, float y, float z)
267         {
268                 return interp_3d_ex(x, y, z, interpolation);
269         }
270
271         ccl_always_inline float4 interp_3d_ex(float x, float y, float z,
272                                               int interpolation = INTERPOLATION_LINEAR)
273         {
274                 if(UNLIKELY(!data))
275                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
276
277                 int ix, iy, iz, nix, niy, niz;
278
279                 if(interpolation == INTERPOLATION_CLOSEST) {
280                         frac(x*(float)width, &ix);
281                         frac(y*(float)height, &iy);
282                         frac(z*(float)depth, &iz);
283
284                         switch(extension) {
285                                 case EXTENSION_REPEAT:
286                                         ix = wrap_periodic(ix, width);
287                                         iy = wrap_periodic(iy, height);
288                                         iz = wrap_periodic(iz, depth);
289                                         break;
290                                 case EXTENSION_CLIP:
291                                         if(x < 0.0f || y < 0.0f || z < 0.0f ||
292                                            x > 1.0f || y > 1.0f || z > 1.0f)
293                                         {
294                                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
295                                         }
296                                         /* Fall through. */
297                                 case EXTENSION_EXTEND:
298                                         ix = wrap_clamp(ix, width);
299                                         iy = wrap_clamp(iy, height);
300                                         iz = wrap_clamp(iz, depth);
301                                         break;
302                                 default:
303                                         kernel_assert(0);
304                                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
305                         }
306
307                         return read(data[ix + iy*width + iz*width*height]);
308                 }
309                 else if(interpolation == INTERPOLATION_LINEAR) {
310                         float tx = frac(x*(float)width - 0.5f, &ix);
311                         float ty = frac(y*(float)height - 0.5f, &iy);
312                         float tz = frac(z*(float)depth - 0.5f, &iz);
313
314                         switch(extension) {
315                                 case EXTENSION_REPEAT:
316                                         ix = wrap_periodic(ix, width);
317                                         iy = wrap_periodic(iy, height);
318                                         iz = wrap_periodic(iz, depth);
319
320                                         nix = wrap_periodic(ix+1, width);
321                                         niy = wrap_periodic(iy+1, height);
322                                         niz = wrap_periodic(iz+1, depth);
323                                         break;
324                                 case EXTENSION_CLIP:
325                                         if(x < 0.0f || y < 0.0f || z < 0.0f ||
326                                            x > 1.0f || y > 1.0f || z > 1.0f)
327                                         {
328                                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
329                                         }
330                                         /* Fall through. */
331                                 case EXTENSION_EXTEND:
332                                         nix = wrap_clamp(ix+1, width);
333                                         niy = wrap_clamp(iy+1, height);
334                                         niz = wrap_clamp(iz+1, depth);
335
336                                         ix = wrap_clamp(ix, width);
337                                         iy = wrap_clamp(iy, height);
338                                         iz = wrap_clamp(iz, depth);
339                                         break;
340                                 default:
341                                         kernel_assert(0);
342                                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
343                         }
344
345                         float4 r;
346
347                         r  = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + iz*width*height]);
348                         r += (1.0f - tz)*(1.0f - ty)*tx*read(data[nix + iy*width + iz*width*height]);
349                         r += (1.0f - tz)*ty*(1.0f - tx)*read(data[ix + niy*width + iz*width*height]);
350                         r += (1.0f - tz)*ty*tx*read(data[nix + niy*width + iz*width*height]);
351
352                         r += tz*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + niz*width*height]);
353                         r += tz*(1.0f - ty)*tx*read(data[nix + iy*width + niz*width*height]);
354                         r += tz*ty*(1.0f - tx)*read(data[ix + niy*width + niz*width*height]);
355                         r += tz*ty*tx*read(data[nix + niy*width + niz*width*height]);
356
357                         return r;
358                 }
359                 else {
360                         /* Tricubic b-spline interpolation. */
361                         const float tx = frac(x*(float)width - 0.5f, &ix);
362                         const float ty = frac(y*(float)height - 0.5f, &iy);
363                         const float tz = frac(z*(float)depth - 0.5f, &iz);
364                         int pix, piy, piz, nnix, nniy, nniz;
365
366                         switch(extension) {
367                                 case EXTENSION_REPEAT:
368                                         ix = wrap_periodic(ix, width);
369                                         iy = wrap_periodic(iy, height);
370                                         iz = wrap_periodic(iz, depth);
371
372                                         pix = wrap_periodic(ix-1, width);
373                                         piy = wrap_periodic(iy-1, height);
374                                         piz = wrap_periodic(iz-1, depth);
375
376                                         nix = wrap_periodic(ix+1, width);
377                                         niy = wrap_periodic(iy+1, height);
378                                         niz = wrap_periodic(iz+1, depth);
379
380                                         nnix = wrap_periodic(ix+2, width);
381                                         nniy = wrap_periodic(iy+2, height);
382                                         nniz = wrap_periodic(iz+2, depth);
383                                         break;
384                                 case EXTENSION_CLIP:
385                                         if(x < 0.0f || y < 0.0f || z < 0.0f ||
386                                            x > 1.0f || y > 1.0f || z > 1.0f)
387                                         {
388                                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
389                                         }
390                                         /* Fall through. */
391                                 case EXTENSION_EXTEND:
392                                         pix = wrap_clamp(ix-1, width);
393                                         piy = wrap_clamp(iy-1, height);
394                                         piz = wrap_clamp(iz-1, depth);
395
396                                         nix = wrap_clamp(ix+1, width);
397                                         niy = wrap_clamp(iy+1, height);
398                                         niz = wrap_clamp(iz+1, depth);
399
400                                         nnix = wrap_clamp(ix+2, width);
401                                         nniy = wrap_clamp(iy+2, height);
402                                         nniz = wrap_clamp(iz+2, depth);
403
404                                         ix = wrap_clamp(ix, width);
405                                         iy = wrap_clamp(iy, height);
406                                         iz = wrap_clamp(iz, depth);
407                                         break;
408                                 default:
409                                         kernel_assert(0);
410                                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
411                         }
412
413                         const int xc[4] = {pix, ix, nix, nnix};
414                         const int yc[4] = {width * piy,
415                                            width * iy,
416                                            width * niy,
417                                            width * nniy};
418                         const int zc[4] = {width * height * piz,
419                                            width * height * iz,
420                                            width * height * niz,
421                                            width * height * nniz};
422                         float u[4], v[4], w[4];
423
424                         /* Some helper macro to keep code reasonable size,
425                          * let compiler to inline all the matrix multiplications.
426                          */
427 #define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]]))
428 #define COL_TERM(col, row) \
429                         (v[col] * (u[0] * DATA(0, col, row) + \
430                                    u[1] * DATA(1, col, row) + \
431                                    u[2] * DATA(2, col, row) + \
432                                    u[3] * DATA(3, col, row)))
433 #define ROW_TERM(row) \
434                         (w[row] * (COL_TERM(0, row) + \
435                                    COL_TERM(1, row) + \
436                                    COL_TERM(2, row) + \
437                                    COL_TERM(3, row)))
438
439                         SET_CUBIC_SPLINE_WEIGHTS(u, tx);
440                         SET_CUBIC_SPLINE_WEIGHTS(v, ty);
441                         SET_CUBIC_SPLINE_WEIGHTS(w, tz);
442
443                         /* Actual interpolation. */
444                         return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
445
446 #undef COL_TERM
447 #undef ROW_TERM
448 #undef DATA
449                 }
450         }
451
452         ccl_always_inline void dimensions_set(int width_, int height_, int depth_)
453         {
454                 width = width_;
455                 height = height_;
456                 depth = depth_;
457         }
458
459         T *data;
460         int interpolation;
461         ExtensionType extension;
462         int width, height, depth;
463 #undef SET_CUBIC_SPLINE_WEIGHTS
464 };
465
466 typedef texture<float4> texture_float4;
467 typedef texture<float2> texture_float2;
468 typedef texture<float> texture_float;
469 typedef texture<uint> texture_uint;
470 typedef texture<int> texture_int;
471 typedef texture<uint4> texture_uint4;
472 typedef texture<uchar4> texture_uchar4;
473 typedef texture_image<float4> texture_image_float4;
474 typedef texture_image<uchar4> texture_image_uchar4;
475
/* Macros to handle different memory storage on different devices.
 *
 * In this CPU version every macro expands to an access through `kg`, so a
 * variable of that name must be in scope wherever they are used.
 */

/* Array textures: `tex` is pasted in as the name of a member of `*kg`, which
 * is why it cannot be parenthesized in these macros. */
#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
#define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index))
#define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index))
/* NOTE(review): the texture<T> struct in this header has no lookup() method,
 * so this macro only compiles for members of some other type; confirm whether
 * it is still used on the CPU. */
#define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size))

/* Image textures: here `tex` is a slot number. Slots below MAX_FLOAT_IMAGES
 * select kg->texture_float_images[tex]; all other slots select
 * kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].
 *
 * `tex` is parenthesized so that an expression argument (for example `a | b`)
 * is compared and offset as a whole. It is still evaluated twice, so it must
 * not have side effects.
 */
#define kernel_tex_image_interp(tex, x, y) \
        (((tex) < MAX_FLOAT_IMAGES) ? kg->texture_float_images[(tex)].interp(x, y) \
                                    : kg->texture_byte_images[(tex) - MAX_FLOAT_IMAGES].interp(x, y))
#define kernel_tex_image_interp_3d(tex, x, y, z) \
        (((tex) < MAX_FLOAT_IMAGES) ? kg->texture_float_images[(tex)].interp_3d(x, y, z) \
                                    : kg->texture_byte_images[(tex) - MAX_FLOAT_IMAGES].interp_3d(x, y, z))
#define kernel_tex_image_interp_3d_ex(tex, x, y, z, interpolation) \
        (((tex) < MAX_FLOAT_IMAGES) ? kg->texture_float_images[(tex)].interp_3d_ex(x, y, z, interpolation) \
                                    : kg->texture_byte_images[(tex) - MAX_FLOAT_IMAGES].interp_3d_ex(x, y, z, interpolation))

/* Kernel data block stored in the globals. */
#define kernel_data (kg->__data)
487
#ifdef __KERNEL_SSE2__
/* vector3 instantiated over the sseb, ssef and ssei types. */
typedef vector3<sseb> sse3b;
typedef vector3<ssef> sse3f;
typedef vector3<ssei> sse3i;

/* Print the x, y and z members of `a`, in that order, each through
 * print_sseb() with the same `label`. */
ccl_device_inline void print_sse3b(const char *label, sse3b& a)
{
        print_sseb(label, a.x);
        print_sseb(label, a.y);
        print_sseb(label, a.z);
}

/* Print the x, y and z members of `a`, in that order, each through
 * print_ssef() with the same `label`. */
ccl_device_inline void print_sse3f(const char *label, sse3f& a)
{
        print_ssef(label, a.x);
        print_ssef(label, a.y);
        print_ssef(label, a.z);
}

/* Print the x, y and z members of `a`, in that order, each through
 * print_ssei() with the same `label`. */
ccl_device_inline void print_sse3i(const char *label, sse3i& a)
{
        print_ssei(label, a.x);
        print_ssei(label, a.y);
        print_ssei(label, a.z);
}

#endif
515
516 CCL_NAMESPACE_END
517
518 #endif /* __KERNEL_COMPAT_CPU_H__ */
519