2 * Copyright 2011-2013 Blender Foundation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 #ifndef __KERNEL_COMPAT_CPU_H__
18 #define __KERNEL_COMPAT_CPU_H__
20 #define __KERNEL_CPU__
22 /* Release kernel has too much false-positive maybe-uninitialized warnings,
23 * which makes it possible to miss actual warnings.
25 #if (defined(__GNUC__) && !defined(__clang__)) && defined(NDEBUG)
26 # pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
27 # pragma GCC diagnostic ignored "-Wuninitialized"
30 /* Selective nodes compilation. */
31 #ifndef __NODES_MAX_GROUP__
32 # define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
34 #ifndef __NODES_FEATURES__
35 # define __NODES_FEATURES__ NODE_FEATURE_ALL
38 #include "util/util_debug.h"
39 #include "util/util_math.h"
40 #include "util/util_simd.h"
41 #include "util/util_half.h"
42 #include "util/util_types.h"
43 #include "util/util_texture.h"
45 #define ccl_restrict_ptr const * __restrict
47 #define ccl_addr_space
49 #define ccl_local_id(d) 0
50 #define ccl_global_id(d) (kg->global_id[d])
52 #define ccl_local_size(d) 1
53 #define ccl_global_size(d) (kg->global_size[d])
55 #define ccl_group_id(d) ccl_global_id(d)
56 #define ccl_num_groups(d) ccl_global_size(d)
58 /* On x86_64, versions of glibc < 2.16 have an issue where expf is
59 * much slower than the double version. This was fixed in glibc 2.16.
61 #if !defined(__KERNEL_GPU__) && defined(__x86_64__) && defined(__x86_64__) && \
62 defined(__GNU_LIBRARY__) && defined(__GLIBC__ ) && defined(__GLIBC_MINOR__) && \
63 (__GLIBC__ <= 2 && __GLIBC_MINOR__ < 16)
64 # define expf(x) ((float)exp((double)(x)))
69 /* Assertions inside the kernel only work for the CPU device, so we wrap it in
70 * a macro which is empty for other devices */
72 #define kernel_assert(cond) assert(cond)
74 /* Texture types to be compatible with CUDA textures. These are really just
75 * simple arrays and after inlining fetch hopefully revert to being a simple
78 template<typename T> struct texture {
79 ccl_always_inline T fetch(int index)
81 kernel_assert(index >= 0 && index < width);
86 /* Reads 256 bytes but indexes in blocks of 128 bytes to maintain
87 * compatibility with existing indicies and data structures.
89 ccl_always_inline avxf fetch_avxf(const int index)
91 kernel_assert(index >= 0 && (index+1) < width);
92 ssef *ssef_data = (ssef*)data;
93 ssef *ssef_node_data = &ssef_data[index];
94 return _mm256_loadu_ps((float *)ssef_node_data);
99 #ifdef __KERNEL_SSE2__
100 ccl_always_inline ssef fetch_ssef(int index)
102 kernel_assert(index >= 0 && index < width);
103 return ((ssef*)data)[index];
106 ccl_always_inline ssei fetch_ssei(int index)
108 kernel_assert(index >= 0 && index < width);
109 return ((ssei*)data)[index];
117 template<typename T> struct texture_image {
118 #define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
120 u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \
121 u[1] = (( 0.5f * t - 1.0f) * t ) * t + (2.0f/3.0f); \
122 u[2] = (( -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \
123 u[3] = (1.0f / 6.0f) * t * t * t; \
126 ccl_always_inline float4 read(float4 r)
131 ccl_always_inline float4 read(uchar4 r)
133 float f = 1.0f/255.0f;
134 return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
137 ccl_always_inline float4 read(uchar r)
139 float f = r*(1.0f/255.0f);
140 return make_float4(f, f, f, 1.0f);
143 ccl_always_inline float4 read(float r)
145 /* TODO(dingto): Optimize this, so interpolation
146 * happens on float instead of float4 */
147 return make_float4(r, r, r, 1.0f);
150 ccl_always_inline float4 read(half4 r)
152 return half4_to_float4(r);
155 ccl_always_inline float4 read(half r)
157 float f = half_to_float(r);
158 return make_float4(f, f, f, 1.0f);
161 ccl_always_inline int wrap_periodic(int x, int width)
169 ccl_always_inline int wrap_clamp(int x, int width)
171 return clamp(x, 0, width-1);
174 ccl_always_inline float frac(float x, int *ix)
176 int i = float_to_int(x) - ((x < 0.0f)? 1: 0);
181 ccl_always_inline float4 interp(float x, float y)
184 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
186 int ix, iy, nix, niy;
188 if(interpolation == INTERPOLATION_CLOSEST) {
189 frac(x*(float)width, &ix);
190 frac(y*(float)height, &iy);
192 case EXTENSION_REPEAT:
193 ix = wrap_periodic(ix, width);
194 iy = wrap_periodic(iy, height);
197 if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
198 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
201 case EXTENSION_EXTEND:
202 ix = wrap_clamp(ix, width);
203 iy = wrap_clamp(iy, height);
207 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
209 return read(data[ix + iy*width]);
211 else if(interpolation == INTERPOLATION_LINEAR) {
212 float tx = frac(x*(float)width - 0.5f, &ix);
213 float ty = frac(y*(float)height - 0.5f, &iy);
216 case EXTENSION_REPEAT:
217 ix = wrap_periodic(ix, width);
218 iy = wrap_periodic(iy, height);
220 nix = wrap_periodic(ix+1, width);
221 niy = wrap_periodic(iy+1, height);
224 if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
225 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
228 case EXTENSION_EXTEND:
229 nix = wrap_clamp(ix+1, width);
230 niy = wrap_clamp(iy+1, height);
232 ix = wrap_clamp(ix, width);
233 iy = wrap_clamp(iy, height);
237 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
240 float4 r = (1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width]);
241 r += (1.0f - ty)*tx*read(data[nix + iy*width]);
242 r += ty*(1.0f - tx)*read(data[ix + niy*width]);
243 r += ty*tx*read(data[nix + niy*width]);
248 /* Bicubic b-spline interpolation. */
249 float tx = frac(x*(float)width - 0.5f, &ix);
250 float ty = frac(y*(float)height - 0.5f, &iy);
251 int pix, piy, nnix, nniy;
253 case EXTENSION_REPEAT:
254 ix = wrap_periodic(ix, width);
255 iy = wrap_periodic(iy, height);
257 pix = wrap_periodic(ix-1, width);
258 piy = wrap_periodic(iy-1, height);
260 nix = wrap_periodic(ix+1, width);
261 niy = wrap_periodic(iy+1, height);
263 nnix = wrap_periodic(ix+2, width);
264 nniy = wrap_periodic(iy+2, height);
267 if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
268 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
271 case EXTENSION_EXTEND:
272 pix = wrap_clamp(ix-1, width);
273 piy = wrap_clamp(iy-1, height);
275 nix = wrap_clamp(ix+1, width);
276 niy = wrap_clamp(iy+1, height);
278 nnix = wrap_clamp(ix+2, width);
279 nniy = wrap_clamp(iy+2, height);
281 ix = wrap_clamp(ix, width);
282 iy = wrap_clamp(iy, height);
286 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
289 const int xc[4] = {pix, ix, nix, nnix};
290 const int yc[4] = {width * piy,
295 /* Some helper macro to keep code reasonable size,
296 * let compiler to inline all the matrix multiplications.
298 #define DATA(x, y) (read(data[xc[x] + yc[y]]))
300 (v[col] * (u[0] * DATA(0, col) + \
301 u[1] * DATA(1, col) + \
302 u[2] * DATA(2, col) + \
303 u[3] * DATA(3, col)))
305 SET_CUBIC_SPLINE_WEIGHTS(u, tx);
306 SET_CUBIC_SPLINE_WEIGHTS(v, ty);
308 /* Actual interpolation. */
309 return TERM(0) + TERM(1) + TERM(2) + TERM(3);
316 ccl_always_inline float4 interp_3d(float x, float y, float z)
318 return interp_3d_ex(x, y, z, interpolation);
321 ccl_always_inline float4 interp_3d_ex_closest(float x, float y, float z)
324 frac(x*(float)width, &ix);
325 frac(y*(float)height, &iy);
326 frac(z*(float)depth, &iz);
329 case EXTENSION_REPEAT:
330 ix = wrap_periodic(ix, width);
331 iy = wrap_periodic(iy, height);
332 iz = wrap_periodic(iz, depth);
335 if(x < 0.0f || y < 0.0f || z < 0.0f ||
336 x > 1.0f || y > 1.0f || z > 1.0f)
338 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
341 case EXTENSION_EXTEND:
342 ix = wrap_clamp(ix, width);
343 iy = wrap_clamp(iy, height);
344 iz = wrap_clamp(iz, depth);
348 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
351 return read(data[ix + iy*width + iz*width*height]);
354 ccl_always_inline float4 interp_3d_ex_linear(float x, float y, float z)
359 float tx = frac(x*(float)width - 0.5f, &ix);
360 float ty = frac(y*(float)height - 0.5f, &iy);
361 float tz = frac(z*(float)depth - 0.5f, &iz);
364 case EXTENSION_REPEAT:
365 ix = wrap_periodic(ix, width);
366 iy = wrap_periodic(iy, height);
367 iz = wrap_periodic(iz, depth);
369 nix = wrap_periodic(ix+1, width);
370 niy = wrap_periodic(iy+1, height);
371 niz = wrap_periodic(iz+1, depth);
374 if(x < 0.0f || y < 0.0f || z < 0.0f ||
375 x > 1.0f || y > 1.0f || z > 1.0f)
377 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
380 case EXTENSION_EXTEND:
381 nix = wrap_clamp(ix+1, width);
382 niy = wrap_clamp(iy+1, height);
383 niz = wrap_clamp(iz+1, depth);
385 ix = wrap_clamp(ix, width);
386 iy = wrap_clamp(iy, height);
387 iz = wrap_clamp(iz, depth);
391 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
396 r = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + iz*width*height]);
397 r += (1.0f - tz)*(1.0f - ty)*tx*read(data[nix + iy*width + iz*width*height]);
398 r += (1.0f - tz)*ty*(1.0f - tx)*read(data[ix + niy*width + iz*width*height]);
399 r += (1.0f - tz)*ty*tx*read(data[nix + niy*width + iz*width*height]);
401 r += tz*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + niz*width*height]);
402 r += tz*(1.0f - ty)*tx*read(data[nix + iy*width + niz*width*height]);
403 r += tz*ty*(1.0f - tx)*read(data[ix + niy*width + niz*width*height]);
404 r += tz*ty*tx*read(data[nix + niy*width + niz*width*height]);
409 /* TODO(sergey): For some unspeakable reason both GCC-6 and Clang-3.9 are
410 * causing stack overflow issue in this function unless it is inlined.
412 * Only happens for AVX2 kernel and global __KERNEL_SSE__ vectorization
420 float4 interp_3d_ex_tricubic(float x, float y, float z)
424 /* Tricubic b-spline interpolation. */
425 const float tx = frac(x*(float)width - 0.5f, &ix);
426 const float ty = frac(y*(float)height - 0.5f, &iy);
427 const float tz = frac(z*(float)depth - 0.5f, &iz);
428 int pix, piy, piz, nnix, nniy, nniz;
431 case EXTENSION_REPEAT:
432 ix = wrap_periodic(ix, width);
433 iy = wrap_periodic(iy, height);
434 iz = wrap_periodic(iz, depth);
436 pix = wrap_periodic(ix-1, width);
437 piy = wrap_periodic(iy-1, height);
438 piz = wrap_periodic(iz-1, depth);
440 nix = wrap_periodic(ix+1, width);
441 niy = wrap_periodic(iy+1, height);
442 niz = wrap_periodic(iz+1, depth);
444 nnix = wrap_periodic(ix+2, width);
445 nniy = wrap_periodic(iy+2, height);
446 nniz = wrap_periodic(iz+2, depth);
449 if(x < 0.0f || y < 0.0f || z < 0.0f ||
450 x > 1.0f || y > 1.0f || z > 1.0f)
452 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
455 case EXTENSION_EXTEND:
456 pix = wrap_clamp(ix-1, width);
457 piy = wrap_clamp(iy-1, height);
458 piz = wrap_clamp(iz-1, depth);
460 nix = wrap_clamp(ix+1, width);
461 niy = wrap_clamp(iy+1, height);
462 niz = wrap_clamp(iz+1, depth);
464 nnix = wrap_clamp(ix+2, width);
465 nniy = wrap_clamp(iy+2, height);
466 nniz = wrap_clamp(iz+2, depth);
468 ix = wrap_clamp(ix, width);
469 iy = wrap_clamp(iy, height);
470 iz = wrap_clamp(iz, depth);
474 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
477 const int xc[4] = {pix, ix, nix, nnix};
478 const int yc[4] = {width * piy,
482 const int zc[4] = {width * height * piz,
484 width * height * niz,
485 width * height * nniz};
486 float u[4], v[4], w[4];
488 /* Some helper macro to keep code reasonable size,
489 * let compiler to inline all the matrix multiplications.
491 #define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]]))
492 #define COL_TERM(col, row) \
493 (v[col] * (u[0] * DATA(0, col, row) + \
494 u[1] * DATA(1, col, row) + \
495 u[2] * DATA(2, col, row) + \
496 u[3] * DATA(3, col, row)))
497 #define ROW_TERM(row) \
498 (w[row] * (COL_TERM(0, row) + \
503 SET_CUBIC_SPLINE_WEIGHTS(u, tx);
504 SET_CUBIC_SPLINE_WEIGHTS(v, ty);
505 SET_CUBIC_SPLINE_WEIGHTS(w, tz);
507 /* Actual interpolation. */
508 return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
515 ccl_always_inline float4 interp_3d_ex(float x, float y, float z,
516 int interpolation = INTERPOLATION_LINEAR)
519 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
521 switch(interpolation) {
522 case INTERPOLATION_CLOSEST:
523 return interp_3d_ex_closest(x, y, z);
524 case INTERPOLATION_LINEAR:
525 return interp_3d_ex_linear(x, y, z);
527 return interp_3d_ex_tricubic(x, y, z);
531 ccl_always_inline void dimensions_set(int width_, int height_, int depth_)
540 ExtensionType extension;
541 int width, height, depth;
542 #undef SET_CUBIC_SPLINE_WEIGHTS
545 typedef texture<float4> texture_float4;
546 typedef texture<float2> texture_float2;
547 typedef texture<float> texture_float;
548 typedef texture<uint> texture_uint;
549 typedef texture<int> texture_int;
550 typedef texture<uint4> texture_uint4;
551 typedef texture<uchar4> texture_uchar4;
552 typedef texture<uchar> texture_uchar;
553 typedef texture_image<float> texture_image_float;
554 typedef texture_image<uchar> texture_image_uchar;
555 typedef texture_image<half> texture_image_half;
556 typedef texture_image<float4> texture_image_float4;
557 typedef texture_image<uchar4> texture_image_uchar4;
558 typedef texture_image<half4> texture_image_half4;
560 /* Macros to handle different memory storage on different devices */
562 #define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
563 #define kernel_tex_fetch_avxf(tex, index) (kg->tex.fetch_avxf(index))
564 #define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index))
565 #define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index))
566 #define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size))
568 #define kernel_tex_image_interp(tex,x,y) kernel_tex_image_interp_impl(kg,tex,x,y)
569 #define kernel_tex_image_interp_3d(tex, x, y, z) kernel_tex_image_interp_3d_impl(kg,tex,x,y,z)
570 #define kernel_tex_image_interp_3d_ex(tex, x, y, z, interpolation) kernel_tex_image_interp_3d_ex_impl(kg,tex, x, y, z, interpolation)
572 #define kernel_data (kg->__data)
574 #ifdef __KERNEL_SSE2__
575 typedef vector3<sseb> sse3b;
576 typedef vector3<ssef> sse3f;
577 typedef vector3<ssei> sse3i;
579 ccl_device_inline void print_sse3b(const char *label, sse3b& a)
581 print_sseb(label, a.x);
582 print_sseb(label, a.y);
583 print_sseb(label, a.z);
586 ccl_device_inline void print_sse3f(const char *label, sse3f& a)
588 print_ssef(label, a.x);
589 print_ssef(label, a.y);
590 print_ssef(label, a.z);
593 ccl_device_inline void print_sse3i(const char *label, sse3i& a)
595 print_ssei(label, a.x);
596 print_ssei(label, a.y);
597 print_ssei(label, a.z);
604 #endif /* __KERNEL_COMPAT_CPU_H__ */