7595e74e2d569873541d23eefe2a87b714a14ace
[blender.git] / intern / cycles / kernel / kernel_compat_cpu.h
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifndef __KERNEL_COMPAT_CPU_H__
18 #define __KERNEL_COMPAT_CPU_H__
19
20 #define __KERNEL_CPU__
21
22 /* Release kernel has too much false-positive maybe-uninitialized warnings,
23  * which makes it possible to miss actual warnings.
24  */
25 #if (defined(__GNUC__) && !defined(__clang__)) && defined(NDEBUG)
26 #  pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
27 #  pragma GCC diagnostic ignored "-Wuninitialized"
28 #endif
29
30 /* Selective nodes compilation. */
31 #ifndef __NODES_MAX_GROUP__
32 #  define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
33 #endif
34 #ifndef __NODES_FEATURES__
35 #  define __NODES_FEATURES__ NODE_FEATURE_ALL
36 #endif
37
38 #include "util/util_debug.h"
39 #include "util/util_math.h"
40 #include "util/util_simd.h"
41 #include "util/util_half.h"
42 #include "util/util_types.h"
43 #include "util/util_texture.h"
44
45 #define ccl_restrict_ptr const * __restrict
46
47 #define ccl_addr_space
48
49 #define ccl_local_id(d) 0
50 #define ccl_global_id(d) (kg->global_id[d])
51
52 #define ccl_local_size(d) 1
53 #define ccl_global_size(d) (kg->global_size[d])
54
55 #define ccl_group_id(d) ccl_global_id(d)
56 #define ccl_num_groups(d) ccl_global_size(d)
57
58 /* On x86_64, versions of glibc < 2.16 have an issue where expf is
59  * much slower than the double version.  This was fixed in glibc 2.16.
60  */
61 #if !defined(__KERNEL_GPU__)  && defined(__x86_64__) && defined(__x86_64__) && \
62      defined(__GNU_LIBRARY__) && defined(__GLIBC__ ) && defined(__GLIBC_MINOR__) && \
63      (__GLIBC__ <= 2 && __GLIBC_MINOR__ < 16)
64 #  define expf(x) ((float)exp((double)(x)))
65 #endif
66
67 CCL_NAMESPACE_BEGIN
68
69 /* Assertions inside the kernel only work for the CPU device, so we wrap it in
70  * a macro which is empty for other devices */
71
72 #define kernel_assert(cond) assert(cond)
73
74 /* Texture types to be compatible with CUDA textures. These are really just
75  * simple arrays and after inlining fetch hopefully revert to being a simple
76  * pointer lookup. */
77
78 template<typename T> struct texture  {
79         ccl_always_inline T fetch(int index)
80         {
81                 kernel_assert(index >= 0 && index < width);
82                 return data[index];
83         }
84
85 #ifdef __KERNEL_AVX__
86         /* Reads 256 bytes but indexes in blocks of 128 bytes to maintain
87          * compatibility with existing indicies and data structures.
88          */
89         ccl_always_inline avxf fetch_avxf(const int index)
90         {
91                 kernel_assert(index >= 0 && (index+1) < width);
92                 ssef *ssef_data = (ssef*)data;
93                 ssef *ssef_node_data = &ssef_data[index];
94                 return _mm256_loadu_ps((float *)ssef_node_data);
95         }
96
97 #endif
98
99 #ifdef __KERNEL_SSE2__
100         ccl_always_inline ssef fetch_ssef(int index)
101         {
102                 kernel_assert(index >= 0 && index < width);
103                 return ((ssef*)data)[index];
104         }
105
106         ccl_always_inline ssei fetch_ssei(int index)
107         {
108                 kernel_assert(index >= 0 && index < width);
109                 return ((ssei*)data)[index];
110         }
111 #endif
112
113         T *data;
114         int width;
115 };
116
117 template<typename T> struct texture_image  {
118 #define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
119         { \
120                 u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \
121                 u[1] =  ((      0.5f * t - 1.0f) * t       ) * t + (2.0f/3.0f); \
122                 u[2] =  ((     -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \
123                 u[3] = (1.0f / 6.0f) * t * t * t; \
124         } (void)0
125
126         ccl_always_inline float4 read(float4 r)
127         {
128                 return r;
129         }
130
131         ccl_always_inline float4 read(uchar4 r)
132         {
133                 float f = 1.0f/255.0f;
134                 return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
135         }
136
137         ccl_always_inline float4 read(uchar r)
138         {
139                 float f = r*(1.0f/255.0f);
140                 return make_float4(f, f, f, 1.0f);
141         }
142
143         ccl_always_inline float4 read(float r)
144         {
145                 /* TODO(dingto): Optimize this, so interpolation
146                  * happens on float instead of float4 */
147                 return make_float4(r, r, r, 1.0f);
148         }
149
150         ccl_always_inline float4 read(half4 r)
151         {
152                 return half4_to_float4(r);
153         }
154
155         ccl_always_inline float4 read(half r)
156         {
157                 float f = half_to_float(r);
158                 return make_float4(f, f, f, 1.0f);
159         }
160
161         ccl_always_inline int wrap_periodic(int x, int width)
162         {
163                 x %= width;
164                 if(x < 0)
165                         x += width;
166                 return x;
167         }
168
169         ccl_always_inline int wrap_clamp(int x, int width)
170         {
171                 return clamp(x, 0, width-1);
172         }
173
174         ccl_always_inline float frac(float x, int *ix)
175         {
176                 int i = float_to_int(x) - ((x < 0.0f)? 1: 0);
177                 *ix = i;
178                 return x - (float)i;
179         }
180
181         ccl_always_inline float4 interp(float x, float y)
182         {
183                 if(UNLIKELY(!data))
184                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
185
186                 int ix, iy, nix, niy;
187
188                 if(interpolation == INTERPOLATION_CLOSEST) {
189                         frac(x*(float)width, &ix);
190                         frac(y*(float)height, &iy);
191                         switch(extension) {
192                                 case EXTENSION_REPEAT:
193                                         ix = wrap_periodic(ix, width);
194                                         iy = wrap_periodic(iy, height);
195                                         break;
196                                 case EXTENSION_CLIP:
197                                         if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
198                                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
199                                         }
200                                         /* Fall through. */
201                                 case EXTENSION_EXTEND:
202                                         ix = wrap_clamp(ix, width);
203                                         iy = wrap_clamp(iy, height);
204                                         break;
205                                 default:
206                                         kernel_assert(0);
207                                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
208                         }
209                         return read(data[ix + iy*width]);
210                 }
211                 else if(interpolation == INTERPOLATION_LINEAR) {
212                         float tx = frac(x*(float)width - 0.5f, &ix);
213                         float ty = frac(y*(float)height - 0.5f, &iy);
214
215                         switch(extension) {
216                                 case EXTENSION_REPEAT:
217                                         ix = wrap_periodic(ix, width);
218                                         iy = wrap_periodic(iy, height);
219
220                                         nix = wrap_periodic(ix+1, width);
221                                         niy = wrap_periodic(iy+1, height);
222                                         break;
223                                 case EXTENSION_CLIP:
224                                         if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
225                                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
226                                         }
227                                         /* Fall through. */
228                                 case EXTENSION_EXTEND:
229                                         nix = wrap_clamp(ix+1, width);
230                                         niy = wrap_clamp(iy+1, height);
231
232                                         ix = wrap_clamp(ix, width);
233                                         iy = wrap_clamp(iy, height);
234                                         break;
235                                 default:
236                                         kernel_assert(0);
237                                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
238                         }
239
240                         float4 r = (1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width]);
241                         r += (1.0f - ty)*tx*read(data[nix + iy*width]);
242                         r += ty*(1.0f - tx)*read(data[ix + niy*width]);
243                         r += ty*tx*read(data[nix + niy*width]);
244
245                         return r;
246                 }
247                 else {
248                         /* Bicubic b-spline interpolation. */
249                         float tx = frac(x*(float)width - 0.5f, &ix);
250                         float ty = frac(y*(float)height - 0.5f, &iy);
251                         int pix, piy, nnix, nniy;
252                         switch(extension) {
253                                 case EXTENSION_REPEAT:
254                                         ix = wrap_periodic(ix, width);
255                                         iy = wrap_periodic(iy, height);
256
257                                         pix = wrap_periodic(ix-1, width);
258                                         piy = wrap_periodic(iy-1, height);
259
260                                         nix = wrap_periodic(ix+1, width);
261                                         niy = wrap_periodic(iy+1, height);
262
263                                         nnix = wrap_periodic(ix+2, width);
264                                         nniy = wrap_periodic(iy+2, height);
265                                         break;
266                                 case EXTENSION_CLIP:
267                                         if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
268                                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
269                                         }
270                                         /* Fall through. */
271                                 case EXTENSION_EXTEND:
272                                         pix = wrap_clamp(ix-1, width);
273                                         piy = wrap_clamp(iy-1, height);
274
275                                         nix = wrap_clamp(ix+1, width);
276                                         niy = wrap_clamp(iy+1, height);
277
278                                         nnix = wrap_clamp(ix+2, width);
279                                         nniy = wrap_clamp(iy+2, height);
280
281                                         ix = wrap_clamp(ix, width);
282                                         iy = wrap_clamp(iy, height);
283                                         break;
284                                 default:
285                                         kernel_assert(0);
286                                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
287                         }
288
289                         const int xc[4] = {pix, ix, nix, nnix};
290                         const int yc[4] = {width * piy,
291                                            width * iy,
292                                            width * niy,
293                                            width * nniy};
294                         float u[4], v[4];
295                         /* Some helper macro to keep code reasonable size,
296                          * let compiler to inline all the matrix multiplications.
297                          */
298 #define DATA(x, y) (read(data[xc[x] + yc[y]]))
299 #define TERM(col) \
300                         (v[col] * (u[0] * DATA(0, col) + \
301                                    u[1] * DATA(1, col) + \
302                                    u[2] * DATA(2, col) + \
303                                    u[3] * DATA(3, col)))
304
305                         SET_CUBIC_SPLINE_WEIGHTS(u, tx);
306                         SET_CUBIC_SPLINE_WEIGHTS(v, ty);
307
308                         /* Actual interpolation. */
309                         return TERM(0) + TERM(1) + TERM(2) + TERM(3);
310
311 #undef TERM
312 #undef DATA
313                 }
314         }
315
316         ccl_always_inline float4 interp_3d(float x, float y, float z)
317         {
318                 return interp_3d_ex(x, y, z, interpolation);
319         }
320
321         ccl_always_inline float4 interp_3d_ex_closest(float x, float y, float z)
322         {
323                 int ix, iy, iz;
324                 frac(x*(float)width, &ix);
325                 frac(y*(float)height, &iy);
326                 frac(z*(float)depth, &iz);
327
328                 switch(extension) {
329                         case EXTENSION_REPEAT:
330                                 ix = wrap_periodic(ix, width);
331                                 iy = wrap_periodic(iy, height);
332                                 iz = wrap_periodic(iz, depth);
333                                 break;
334                         case EXTENSION_CLIP:
335                                 if(x < 0.0f || y < 0.0f || z < 0.0f ||
336                                    x > 1.0f || y > 1.0f || z > 1.0f)
337                                 {
338                                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
339                                 }
340                                 /* Fall through. */
341                         case EXTENSION_EXTEND:
342                                 ix = wrap_clamp(ix, width);
343                                 iy = wrap_clamp(iy, height);
344                                 iz = wrap_clamp(iz, depth);
345                                 break;
346                         default:
347                                 kernel_assert(0);
348                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
349                 }
350
351                 return read(data[ix + iy*width + iz*width*height]);
352         }
353
354         ccl_always_inline float4 interp_3d_ex_linear(float x, float y, float z)
355         {
356                 int ix, iy, iz;
357                 int nix, niy, niz;
358
359                 float tx = frac(x*(float)width - 0.5f, &ix);
360                 float ty = frac(y*(float)height - 0.5f, &iy);
361                 float tz = frac(z*(float)depth - 0.5f, &iz);
362
363                 switch(extension) {
364                         case EXTENSION_REPEAT:
365                                 ix = wrap_periodic(ix, width);
366                                 iy = wrap_periodic(iy, height);
367                                 iz = wrap_periodic(iz, depth);
368
369                                 nix = wrap_periodic(ix+1, width);
370                                 niy = wrap_periodic(iy+1, height);
371                                 niz = wrap_periodic(iz+1, depth);
372                                 break;
373                         case EXTENSION_CLIP:
374                                 if(x < 0.0f || y < 0.0f || z < 0.0f ||
375                                    x > 1.0f || y > 1.0f || z > 1.0f)
376                                 {
377                                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
378                                 }
379                                 /* Fall through. */
380                         case EXTENSION_EXTEND:
381                                 nix = wrap_clamp(ix+1, width);
382                                 niy = wrap_clamp(iy+1, height);
383                                 niz = wrap_clamp(iz+1, depth);
384
385                                 ix = wrap_clamp(ix, width);
386                                 iy = wrap_clamp(iy, height);
387                                 iz = wrap_clamp(iz, depth);
388                                 break;
389                         default:
390                                 kernel_assert(0);
391                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
392                 }
393
394                 float4 r;
395
396                 r  = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + iz*width*height]);
397                 r += (1.0f - tz)*(1.0f - ty)*tx*read(data[nix + iy*width + iz*width*height]);
398                 r += (1.0f - tz)*ty*(1.0f - tx)*read(data[ix + niy*width + iz*width*height]);
399                 r += (1.0f - tz)*ty*tx*read(data[nix + niy*width + iz*width*height]);
400
401                 r += tz*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + niz*width*height]);
402                 r += tz*(1.0f - ty)*tx*read(data[nix + iy*width + niz*width*height]);
403                 r += tz*ty*(1.0f - tx)*read(data[ix + niy*width + niz*width*height]);
404                 r += tz*ty*tx*read(data[nix + niy*width + niz*width*height]);
405
406                 return r;
407         }
408
409         /* TODO(sergey): For some unspeakable reason both GCC-6 and Clang-3.9 are
410          * causing stack overflow issue in this function unless it is inlined.
411          *
412          * Only happens for AVX2 kernel and global __KERNEL_SSE__ vectorization
413          * enabled.
414          */
415 #ifdef __GNUC__
416         ccl_always_inline
417 #else
418         ccl_never_inline
419 #endif
420         float4 interp_3d_ex_tricubic(float x, float y, float z)
421         {
422                 int ix, iy, iz;
423                 int nix, niy, niz;
424                 /* Tricubic b-spline interpolation. */
425                 const float tx = frac(x*(float)width - 0.5f, &ix);
426                 const float ty = frac(y*(float)height - 0.5f, &iy);
427                 const float tz = frac(z*(float)depth - 0.5f, &iz);
428                 int pix, piy, piz, nnix, nniy, nniz;
429
430                 switch(extension) {
431                         case EXTENSION_REPEAT:
432                                 ix = wrap_periodic(ix, width);
433                                 iy = wrap_periodic(iy, height);
434                                 iz = wrap_periodic(iz, depth);
435
436                                 pix = wrap_periodic(ix-1, width);
437                                 piy = wrap_periodic(iy-1, height);
438                                 piz = wrap_periodic(iz-1, depth);
439
440                                 nix = wrap_periodic(ix+1, width);
441                                 niy = wrap_periodic(iy+1, height);
442                                 niz = wrap_periodic(iz+1, depth);
443
444                                 nnix = wrap_periodic(ix+2, width);
445                                 nniy = wrap_periodic(iy+2, height);
446                                 nniz = wrap_periodic(iz+2, depth);
447                                 break;
448                         case EXTENSION_CLIP:
449                                 if(x < 0.0f || y < 0.0f || z < 0.0f ||
450                                    x > 1.0f || y > 1.0f || z > 1.0f)
451                                 {
452                                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
453                                 }
454                                 /* Fall through. */
455                         case EXTENSION_EXTEND:
456                                 pix = wrap_clamp(ix-1, width);
457                                 piy = wrap_clamp(iy-1, height);
458                                 piz = wrap_clamp(iz-1, depth);
459
460                                 nix = wrap_clamp(ix+1, width);
461                                 niy = wrap_clamp(iy+1, height);
462                                 niz = wrap_clamp(iz+1, depth);
463
464                                 nnix = wrap_clamp(ix+2, width);
465                                 nniy = wrap_clamp(iy+2, height);
466                                 nniz = wrap_clamp(iz+2, depth);
467
468                                 ix = wrap_clamp(ix, width);
469                                 iy = wrap_clamp(iy, height);
470                                 iz = wrap_clamp(iz, depth);
471                                 break;
472                         default:
473                                 kernel_assert(0);
474                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
475                 }
476
477                 const int xc[4] = {pix, ix, nix, nnix};
478                 const int yc[4] = {width * piy,
479                                    width * iy,
480                                    width * niy,
481                                    width * nniy};
482                 const int zc[4] = {width * height * piz,
483                                    width * height * iz,
484                                    width * height * niz,
485                                    width * height * nniz};
486                 float u[4], v[4], w[4];
487
488                 /* Some helper macro to keep code reasonable size,
489                  * let compiler to inline all the matrix multiplications.
490                  */
491 #define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]]))
492 #define COL_TERM(col, row) \
493                 (v[col] * (u[0] * DATA(0, col, row) + \
494                            u[1] * DATA(1, col, row) + \
495                            u[2] * DATA(2, col, row) + \
496                            u[3] * DATA(3, col, row)))
497 #define ROW_TERM(row) \
498                 (w[row] * (COL_TERM(0, row) + \
499                            COL_TERM(1, row) + \
500                            COL_TERM(2, row) + \
501                            COL_TERM(3, row)))
502
503                 SET_CUBIC_SPLINE_WEIGHTS(u, tx);
504                 SET_CUBIC_SPLINE_WEIGHTS(v, ty);
505                 SET_CUBIC_SPLINE_WEIGHTS(w, tz);
506
507                 /* Actual interpolation. */
508                 return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
509
510 #undef COL_TERM
511 #undef ROW_TERM
512 #undef DATA
513         }
514
515         ccl_always_inline float4 interp_3d_ex(float x, float y, float z,
516                                               int interpolation = INTERPOLATION_LINEAR)
517         {
518                 if(UNLIKELY(!data))
519                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
520
521                 switch(interpolation) {
522                         case INTERPOLATION_CLOSEST:
523                                 return interp_3d_ex_closest(x, y, z);
524                         case INTERPOLATION_LINEAR:
525                                 return interp_3d_ex_linear(x, y, z);
526                         default:
527                                 return interp_3d_ex_tricubic(x, y, z);
528                 }
529         }
530
531         ccl_always_inline void dimensions_set(int width_, int height_, int depth_)
532         {
533                 width = width_;
534                 height = height_;
535                 depth = depth_;
536         }
537
538         T *data;
539         int interpolation;
540         ExtensionType extension;
541         int width, height, depth;
542 #undef SET_CUBIC_SPLINE_WEIGHTS
543 };
544
545 typedef texture<float4> texture_float4;
546 typedef texture<float2> texture_float2;
547 typedef texture<float> texture_float;
548 typedef texture<uint> texture_uint;
549 typedef texture<int> texture_int;
550 typedef texture<uint4> texture_uint4;
551 typedef texture<uchar4> texture_uchar4;
552 typedef texture<uchar> texture_uchar;
553 typedef texture_image<float> texture_image_float;
554 typedef texture_image<uchar> texture_image_uchar;
555 typedef texture_image<half> texture_image_half;
556 typedef texture_image<float4> texture_image_float4;
557 typedef texture_image<uchar4> texture_image_uchar4;
558 typedef texture_image<half4> texture_image_half4;
559
560 /* Macros to handle different memory storage on different devices */
561
562 #define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
563 #define kernel_tex_fetch_avxf(tex, index) (kg->tex.fetch_avxf(index))
564 #define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index))
565 #define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index))
566 #define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size))
567
568 #define kernel_tex_image_interp(tex,x,y) kernel_tex_image_interp_impl(kg,tex,x,y)
569 #define kernel_tex_image_interp_3d(tex, x, y, z) kernel_tex_image_interp_3d_impl(kg,tex,x,y,z)
570 #define kernel_tex_image_interp_3d_ex(tex, x, y, z, interpolation) kernel_tex_image_interp_3d_ex_impl(kg,tex, x, y, z, interpolation)
571
572 #define kernel_data (kg->__data)
573
574 #ifdef __KERNEL_SSE2__
575 typedef vector3<sseb> sse3b;
576 typedef vector3<ssef> sse3f;
577 typedef vector3<ssei> sse3i;
578
579 ccl_device_inline void print_sse3b(const char *label, sse3b& a)
580 {
581         print_sseb(label, a.x);
582         print_sseb(label, a.y);
583         print_sseb(label, a.z);
584 }
585
586 ccl_device_inline void print_sse3f(const char *label, sse3f& a)
587 {
588         print_ssef(label, a.x);
589         print_ssef(label, a.y);
590         print_ssef(label, a.z);
591 }
592
593 ccl_device_inline void print_sse3i(const char *label, sse3i& a)
594 {
595         print_ssei(label, a.x);
596         print_ssei(label, a.y);
597         print_ssei(label, a.z);
598 }
599
600 #endif
601
602 CCL_NAMESPACE_END
603
604 #endif /* __KERNEL_COMPAT_CPU_H__ */
605