Cycles: Make all #include statements relative to cycles source directory
[blender.git] / intern / cycles / kernel / kernel_compat_cpu.h
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifndef __KERNEL_COMPAT_CPU_H__
18 #define __KERNEL_COMPAT_CPU_H__
19
20 #define __KERNEL_CPU__
21
22 /* Release kernel has too much false-positive maybe-uninitialized warnings,
23  * which makes it possible to miss actual warnings.
24  */
25 #if (defined(__GNUC__) && !defined(__clang__)) && defined(NDEBUG)
26 #  pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
27 #  pragma GCC diagnostic ignored "-Wuninitialized"
28 #endif
29
30 /* Selective nodes compilation. */
31 #ifndef __NODES_MAX_GROUP__
32 #  define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
33 #endif
34 #ifndef __NODES_FEATURES__
35 #  define __NODES_FEATURES__ NODE_FEATURE_ALL
36 #endif
37
38 #include "util/util_debug.h"
39 #include "util/util_math.h"
40 #include "util/util_simd.h"
41 #include "util/util_half.h"
42 #include "util/util_types.h"
43 #include "util/util_texture.h"
44
45 #define ccl_addr_space
46
47 #define ccl_local_id(d) 0
48 #define ccl_global_id(d) (kg->global_id[d])
49
50 #define ccl_local_size(d) 1
51 #define ccl_global_size(d) (kg->global_size[d])
52
53 #define ccl_group_id(d) ccl_global_id(d)
54 #define ccl_num_groups(d) ccl_global_size(d)
55
56 /* On x86_64, versions of glibc < 2.16 have an issue where expf is
57  * much slower than the double version.  This was fixed in glibc 2.16.
58  */
59 #if !defined(__KERNEL_GPU__)  && defined(__x86_64__) && defined(__x86_64__) && \
60      defined(__GNU_LIBRARY__) && defined(__GLIBC__ ) && defined(__GLIBC_MINOR__) && \
61      (__GLIBC__ <= 2 && __GLIBC_MINOR__ < 16)
62 #  define expf(x) ((float)exp((double)(x)))
63 #endif
64
65 CCL_NAMESPACE_BEGIN
66
67 /* Assertions inside the kernel only work for the CPU device, so we wrap it in
68  * a macro which is empty for other devices */
69
70 #define kernel_assert(cond) assert(cond)
71
72 /* Texture types to be compatible with CUDA textures. These are really just
73  * simple arrays and after inlining fetch hopefully revert to being a simple
74  * pointer lookup. */
75
76 template<typename T> struct texture  {
77         ccl_always_inline T fetch(int index)
78         {
79                 kernel_assert(index >= 0 && index < width);
80                 return data[index];
81         }
82
83 #ifdef __KERNEL_AVX__
84         /* Reads 256 bytes but indexes in blocks of 128 bytes to maintain
85          * compatibility with existing indicies and data structures.
86          */
87         ccl_always_inline avxf fetch_avxf(const int index)
88         {
89                 kernel_assert(index >= 0 && (index+1) < width);
90                 ssef *ssef_data = (ssef*)data;
91                 ssef *ssef_node_data = &ssef_data[index];
92                 return _mm256_loadu_ps((float *)ssef_node_data);
93         }
94
95 #endif
96
97 #ifdef __KERNEL_SSE2__
98         ccl_always_inline ssef fetch_ssef(int index)
99         {
100                 kernel_assert(index >= 0 && index < width);
101                 return ((ssef*)data)[index];
102         }
103
104         ccl_always_inline ssei fetch_ssei(int index)
105         {
106                 kernel_assert(index >= 0 && index < width);
107                 return ((ssei*)data)[index];
108         }
109 #endif
110
111         T *data;
112         int width;
113 };
114
115 template<typename T> struct texture_image  {
116 #define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
117         { \
118                 u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \
119                 u[1] =  ((      0.5f * t - 1.0f) * t       ) * t + (2.0f/3.0f); \
120                 u[2] =  ((     -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \
121                 u[3] = (1.0f / 6.0f) * t * t * t; \
122         } (void)0
123
124         ccl_always_inline float4 read(float4 r)
125         {
126                 return r;
127         }
128
129         ccl_always_inline float4 read(uchar4 r)
130         {
131                 float f = 1.0f/255.0f;
132                 return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
133         }
134
135         ccl_always_inline float4 read(uchar r)
136         {
137                 float f = r*(1.0f/255.0f);
138                 return make_float4(f, f, f, 1.0f);
139         }
140
141         ccl_always_inline float4 read(float r)
142         {
143                 /* TODO(dingto): Optimize this, so interpolation
144                  * happens on float instead of float4 */
145                 return make_float4(r, r, r, 1.0f);
146         }
147
148         ccl_always_inline float4 read(half4 r)
149         {
150                 return half4_to_float4(r);
151         }
152
153         ccl_always_inline float4 read(half r)
154         {
155                 float f = half_to_float(r);
156                 return make_float4(f, f, f, 1.0f);
157         }
158
159         ccl_always_inline int wrap_periodic(int x, int width)
160         {
161                 x %= width;
162                 if(x < 0)
163                         x += width;
164                 return x;
165         }
166
167         ccl_always_inline int wrap_clamp(int x, int width)
168         {
169                 return clamp(x, 0, width-1);
170         }
171
172         ccl_always_inline float frac(float x, int *ix)
173         {
174                 int i = float_to_int(x) - ((x < 0.0f)? 1: 0);
175                 *ix = i;
176                 return x - (float)i;
177         }
178
179         ccl_always_inline float4 interp(float x, float y)
180         {
181                 if(UNLIKELY(!data))
182                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
183
184                 int ix, iy, nix, niy;
185
186                 if(interpolation == INTERPOLATION_CLOSEST) {
187                         frac(x*(float)width, &ix);
188                         frac(y*(float)height, &iy);
189                         switch(extension) {
190                                 case EXTENSION_REPEAT:
191                                         ix = wrap_periodic(ix, width);
192                                         iy = wrap_periodic(iy, height);
193                                         break;
194                                 case EXTENSION_CLIP:
195                                         if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
196                                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
197                                         }
198                                         /* Fall through. */
199                                 case EXTENSION_EXTEND:
200                                         ix = wrap_clamp(ix, width);
201                                         iy = wrap_clamp(iy, height);
202                                         break;
203                                 default:
204                                         kernel_assert(0);
205                                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
206                         }
207                         return read(data[ix + iy*width]);
208                 }
209                 else if(interpolation == INTERPOLATION_LINEAR) {
210                         float tx = frac(x*(float)width - 0.5f, &ix);
211                         float ty = frac(y*(float)height - 0.5f, &iy);
212
213                         switch(extension) {
214                                 case EXTENSION_REPEAT:
215                                         ix = wrap_periodic(ix, width);
216                                         iy = wrap_periodic(iy, height);
217
218                                         nix = wrap_periodic(ix+1, width);
219                                         niy = wrap_periodic(iy+1, height);
220                                         break;
221                                 case EXTENSION_CLIP:
222                                         if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
223                                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
224                                         }
225                                         /* Fall through. */
226                                 case EXTENSION_EXTEND:
227                                         nix = wrap_clamp(ix+1, width);
228                                         niy = wrap_clamp(iy+1, height);
229
230                                         ix = wrap_clamp(ix, width);
231                                         iy = wrap_clamp(iy, height);
232                                         break;
233                                 default:
234                                         kernel_assert(0);
235                                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
236                         }
237
238                         float4 r = (1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width]);
239                         r += (1.0f - ty)*tx*read(data[nix + iy*width]);
240                         r += ty*(1.0f - tx)*read(data[ix + niy*width]);
241                         r += ty*tx*read(data[nix + niy*width]);
242
243                         return r;
244                 }
245                 else {
246                         /* Bicubic b-spline interpolation. */
247                         float tx = frac(x*(float)width - 0.5f, &ix);
248                         float ty = frac(y*(float)height - 0.5f, &iy);
249                         int pix, piy, nnix, nniy;
250                         switch(extension) {
251                                 case EXTENSION_REPEAT:
252                                         ix = wrap_periodic(ix, width);
253                                         iy = wrap_periodic(iy, height);
254
255                                         pix = wrap_periodic(ix-1, width);
256                                         piy = wrap_periodic(iy-1, height);
257
258                                         nix = wrap_periodic(ix+1, width);
259                                         niy = wrap_periodic(iy+1, height);
260
261                                         nnix = wrap_periodic(ix+2, width);
262                                         nniy = wrap_periodic(iy+2, height);
263                                         break;
264                                 case EXTENSION_CLIP:
265                                         if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
266                                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
267                                         }
268                                         /* Fall through. */
269                                 case EXTENSION_EXTEND:
270                                         pix = wrap_clamp(ix-1, width);
271                                         piy = wrap_clamp(iy-1, height);
272
273                                         nix = wrap_clamp(ix+1, width);
274                                         niy = wrap_clamp(iy+1, height);
275
276                                         nnix = wrap_clamp(ix+2, width);
277                                         nniy = wrap_clamp(iy+2, height);
278
279                                         ix = wrap_clamp(ix, width);
280                                         iy = wrap_clamp(iy, height);
281                                         break;
282                                 default:
283                                         kernel_assert(0);
284                                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
285                         }
286
287                         const int xc[4] = {pix, ix, nix, nnix};
288                         const int yc[4] = {width * piy,
289                                            width * iy,
290                                            width * niy,
291                                            width * nniy};
292                         float u[4], v[4];
293                         /* Some helper macro to keep code reasonable size,
294                          * let compiler to inline all the matrix multiplications.
295                          */
296 #define DATA(x, y) (read(data[xc[x] + yc[y]]))
297 #define TERM(col) \
298                         (v[col] * (u[0] * DATA(0, col) + \
299                                    u[1] * DATA(1, col) + \
300                                    u[2] * DATA(2, col) + \
301                                    u[3] * DATA(3, col)))
302
303                         SET_CUBIC_SPLINE_WEIGHTS(u, tx);
304                         SET_CUBIC_SPLINE_WEIGHTS(v, ty);
305
306                         /* Actual interpolation. */
307                         return TERM(0) + TERM(1) + TERM(2) + TERM(3);
308
309 #undef TERM
310 #undef DATA
311                 }
312         }
313
314         ccl_always_inline float4 interp_3d(float x, float y, float z)
315         {
316                 return interp_3d_ex(x, y, z, interpolation);
317         }
318
319         ccl_always_inline float4 interp_3d_ex(float x, float y, float z,
320                                               int interpolation = INTERPOLATION_LINEAR)
321         {
322                 if(UNLIKELY(!data))
323                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
324
325                 int ix, iy, iz, nix, niy, niz;
326
327                 if(interpolation == INTERPOLATION_CLOSEST) {
328                         frac(x*(float)width, &ix);
329                         frac(y*(float)height, &iy);
330                         frac(z*(float)depth, &iz);
331
332                         switch(extension) {
333                                 case EXTENSION_REPEAT:
334                                         ix = wrap_periodic(ix, width);
335                                         iy = wrap_periodic(iy, height);
336                                         iz = wrap_periodic(iz, depth);
337                                         break;
338                                 case EXTENSION_CLIP:
339                                         if(x < 0.0f || y < 0.0f || z < 0.0f ||
340                                            x > 1.0f || y > 1.0f || z > 1.0f)
341                                         {
342                                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
343                                         }
344                                         /* Fall through. */
345                                 case EXTENSION_EXTEND:
346                                         ix = wrap_clamp(ix, width);
347                                         iy = wrap_clamp(iy, height);
348                                         iz = wrap_clamp(iz, depth);
349                                         break;
350                                 default:
351                                         kernel_assert(0);
352                                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
353                         }
354
355                         return read(data[ix + iy*width + iz*width*height]);
356                 }
357                 else if(interpolation == INTERPOLATION_LINEAR) {
358                         float tx = frac(x*(float)width - 0.5f, &ix);
359                         float ty = frac(y*(float)height - 0.5f, &iy);
360                         float tz = frac(z*(float)depth - 0.5f, &iz);
361
362                         switch(extension) {
363                                 case EXTENSION_REPEAT:
364                                         ix = wrap_periodic(ix, width);
365                                         iy = wrap_periodic(iy, height);
366                                         iz = wrap_periodic(iz, depth);
367
368                                         nix = wrap_periodic(ix+1, width);
369                                         niy = wrap_periodic(iy+1, height);
370                                         niz = wrap_periodic(iz+1, depth);
371                                         break;
372                                 case EXTENSION_CLIP:
373                                         if(x < 0.0f || y < 0.0f || z < 0.0f ||
374                                            x > 1.0f || y > 1.0f || z > 1.0f)
375                                         {
376                                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
377                                         }
378                                         /* Fall through. */
379                                 case EXTENSION_EXTEND:
380                                         nix = wrap_clamp(ix+1, width);
381                                         niy = wrap_clamp(iy+1, height);
382                                         niz = wrap_clamp(iz+1, depth);
383
384                                         ix = wrap_clamp(ix, width);
385                                         iy = wrap_clamp(iy, height);
386                                         iz = wrap_clamp(iz, depth);
387                                         break;
388                                 default:
389                                         kernel_assert(0);
390                                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
391                         }
392
393                         float4 r;
394
395                         r  = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + iz*width*height]);
396                         r += (1.0f - tz)*(1.0f - ty)*tx*read(data[nix + iy*width + iz*width*height]);
397                         r += (1.0f - tz)*ty*(1.0f - tx)*read(data[ix + niy*width + iz*width*height]);
398                         r += (1.0f - tz)*ty*tx*read(data[nix + niy*width + iz*width*height]);
399
400                         r += tz*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + niz*width*height]);
401                         r += tz*(1.0f - ty)*tx*read(data[nix + iy*width + niz*width*height]);
402                         r += tz*ty*(1.0f - tx)*read(data[ix + niy*width + niz*width*height]);
403                         r += tz*ty*tx*read(data[nix + niy*width + niz*width*height]);
404
405                         return r;
406                 }
407                 else {
408                         /* Tricubic b-spline interpolation. */
409                         const float tx = frac(x*(float)width - 0.5f, &ix);
410                         const float ty = frac(y*(float)height - 0.5f, &iy);
411                         const float tz = frac(z*(float)depth - 0.5f, &iz);
412                         int pix, piy, piz, nnix, nniy, nniz;
413
414                         switch(extension) {
415                                 case EXTENSION_REPEAT:
416                                         ix = wrap_periodic(ix, width);
417                                         iy = wrap_periodic(iy, height);
418                                         iz = wrap_periodic(iz, depth);
419
420                                         pix = wrap_periodic(ix-1, width);
421                                         piy = wrap_periodic(iy-1, height);
422                                         piz = wrap_periodic(iz-1, depth);
423
424                                         nix = wrap_periodic(ix+1, width);
425                                         niy = wrap_periodic(iy+1, height);
426                                         niz = wrap_periodic(iz+1, depth);
427
428                                         nnix = wrap_periodic(ix+2, width);
429                                         nniy = wrap_periodic(iy+2, height);
430                                         nniz = wrap_periodic(iz+2, depth);
431                                         break;
432                                 case EXTENSION_CLIP:
433                                         if(x < 0.0f || y < 0.0f || z < 0.0f ||
434                                            x > 1.0f || y > 1.0f || z > 1.0f)
435                                         {
436                                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
437                                         }
438                                         /* Fall through. */
439                                 case EXTENSION_EXTEND:
440                                         pix = wrap_clamp(ix-1, width);
441                                         piy = wrap_clamp(iy-1, height);
442                                         piz = wrap_clamp(iz-1, depth);
443
444                                         nix = wrap_clamp(ix+1, width);
445                                         niy = wrap_clamp(iy+1, height);
446                                         niz = wrap_clamp(iz+1, depth);
447
448                                         nnix = wrap_clamp(ix+2, width);
449                                         nniy = wrap_clamp(iy+2, height);
450                                         nniz = wrap_clamp(iz+2, depth);
451
452                                         ix = wrap_clamp(ix, width);
453                                         iy = wrap_clamp(iy, height);
454                                         iz = wrap_clamp(iz, depth);
455                                         break;
456                                 default:
457                                         kernel_assert(0);
458                                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
459                         }
460
461                         const int xc[4] = {pix, ix, nix, nnix};
462                         const int yc[4] = {width * piy,
463                                            width * iy,
464                                            width * niy,
465                                            width * nniy};
466                         const int zc[4] = {width * height * piz,
467                                            width * height * iz,
468                                            width * height * niz,
469                                            width * height * nniz};
470                         float u[4], v[4], w[4];
471
472                         /* Some helper macro to keep code reasonable size,
473                          * let compiler to inline all the matrix multiplications.
474                          */
475 #define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]]))
476 #define COL_TERM(col, row) \
477                         (v[col] * (u[0] * DATA(0, col, row) + \
478                                    u[1] * DATA(1, col, row) + \
479                                    u[2] * DATA(2, col, row) + \
480                                    u[3] * DATA(3, col, row)))
481 #define ROW_TERM(row) \
482                         (w[row] * (COL_TERM(0, row) + \
483                                    COL_TERM(1, row) + \
484                                    COL_TERM(2, row) + \
485                                    COL_TERM(3, row)))
486
487                         SET_CUBIC_SPLINE_WEIGHTS(u, tx);
488                         SET_CUBIC_SPLINE_WEIGHTS(v, ty);
489                         SET_CUBIC_SPLINE_WEIGHTS(w, tz);
490
491                         /* Actual interpolation. */
492                         return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
493
494 #undef COL_TERM
495 #undef ROW_TERM
496 #undef DATA
497                 }
498         }
499
500         ccl_always_inline void dimensions_set(int width_, int height_, int depth_)
501         {
502                 width = width_;
503                 height = height_;
504                 depth = depth_;
505         }
506
507         T *data;
508         int interpolation;
509         ExtensionType extension;
510         int width, height, depth;
511 #undef SET_CUBIC_SPLINE_WEIGHTS
512 };
513
514 typedef texture<float4> texture_float4;
515 typedef texture<float2> texture_float2;
516 typedef texture<float> texture_float;
517 typedef texture<uint> texture_uint;
518 typedef texture<int> texture_int;
519 typedef texture<uint4> texture_uint4;
520 typedef texture<uchar4> texture_uchar4;
521 typedef texture<uchar> texture_uchar;
522 typedef texture_image<float> texture_image_float;
523 typedef texture_image<uchar> texture_image_uchar;
524 typedef texture_image<half> texture_image_half;
525 typedef texture_image<float4> texture_image_float4;
526 typedef texture_image<uchar4> texture_image_uchar4;
527 typedef texture_image<half4> texture_image_half4;
528
529 /* Macros to handle different memory storage on different devices */
530
531 #define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
532 #define kernel_tex_fetch_avxf(tex, index) (kg->tex.fetch_avxf(index))
533 #define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index))
534 #define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index))
535 #define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size))
536
537 #define kernel_tex_image_interp(tex,x,y) kernel_tex_image_interp_impl(kg,tex,x,y)
538 #define kernel_tex_image_interp_3d(tex, x, y, z) kernel_tex_image_interp_3d_impl(kg,tex,x,y,z)
539 #define kernel_tex_image_interp_3d_ex(tex, x, y, z, interpolation) kernel_tex_image_interp_3d_ex_impl(kg,tex, x, y, z, interpolation)
540
541 #define kernel_data (kg->__data)
542
543 #ifdef __KERNEL_SSE2__
544 typedef vector3<sseb> sse3b;
545 typedef vector3<ssef> sse3f;
546 typedef vector3<ssei> sse3i;
547
548 ccl_device_inline void print_sse3b(const char *label, sse3b& a)
549 {
550         print_sseb(label, a.x);
551         print_sseb(label, a.y);
552         print_sseb(label, a.z);
553 }
554
555 ccl_device_inline void print_sse3f(const char *label, sse3f& a)
556 {
557         print_ssef(label, a.x);
558         print_ssef(label, a.y);
559         print_ssef(label, a.z);
560 }
561
562 ccl_device_inline void print_sse3i(const char *label, sse3i& a)
563 {
564         print_ssei(label, a.x);
565         print_ssei(label, a.y);
566         print_ssei(label, a.z);
567 }
568
569 #endif
570
571 CCL_NAMESPACE_END
572
573 #endif /* __KERNEL_COMPAT_CPU_H__ */
574