Cycles: Cleanup, whitespace around keywords
[blender.git] / intern / cycles / kernel / kernel_compat_cpu.h
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifndef __KERNEL_COMPAT_CPU_H__
18 #define __KERNEL_COMPAT_CPU_H__
19
20 #define __KERNEL_CPU__
21
/* Release builds of the kernel produce too many false-positive
 * maybe-uninitialized warnings, which makes it easy to miss real ones.
 */
25 #if defined(__GNUC__) && defined(NDEBUG)
26 #  pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
27 #  pragma GCC diagnostic ignored "-Wuninitialized"
28 #endif
29
30 /* Selective nodes compilation. */
31 #ifndef __NODES_MAX_GROUP__
32 #  define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
33 #endif
34 #ifndef __NODES_FEATURES__
35 #  define __NODES_FEATURES__ NODE_FEATURE_ALL
36 #endif
37
38 #include "util_debug.h"
39 #include "util_math.h"
40 #include "util_simd.h"
41 #include "util_half.h"
42 #include "util_types.h"
43
44 #define ccl_addr_space
45
/* On x86_64, versions of glibc < 2.16 have an issue where expf is
 * much slower than the double version.  This was fixed in glibc 2.16.
 *
 * For those old versions expf() is redirected to the double precision exp()
 * and the result is narrowed back to float. The version test is written as
 * "major < 2, or major == 2 and minor < 16" so it is a true "older than 2.16"
 * comparison for every major/minor pair.
 */
#if !defined(__KERNEL_GPU__) && defined(__x86_64__) && \
    defined(__GNU_LIBRARY__) && defined(__GLIBC__) && defined(__GLIBC_MINOR__) && \
    (__GLIBC__ < 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ < 16))
#  define expf(x) ((float)exp((double)(x)))
#endif
54
55 CCL_NAMESPACE_BEGIN
56
/* kernel_assert() is the assertion hook used by kernel code. Assertions only
 * work on the CPU device, so on the CPU the hook forwards straight to
 * assert(); for other devices the macro is empty. */

#define kernel_assert(expr) assert(expr)
61
62 /* Texture types to be compatible with CUDA textures. These are really just
63  * simple arrays and after inlining fetch hopefully revert to being a simple
64  * pointer lookup. */
65
66 template<typename T> struct texture  {
67         ccl_always_inline T fetch(int index)
68         {
69                 kernel_assert(index >= 0 && index < width);
70                 return data[index];
71         }
72
73 #ifdef __KERNEL_SSE2__
74         ccl_always_inline ssef fetch_ssef(int index)
75         {
76                 kernel_assert(index >= 0 && index < width);
77                 return ((ssef*)data)[index];
78         }
79
80         ccl_always_inline ssei fetch_ssei(int index)
81         {
82                 kernel_assert(index >= 0 && index < width);
83                 return ((ssei*)data)[index];
84         }
85 #endif
86
87         T *data;
88         int width;
89 };
90
91 template<typename T> struct texture_image  {
92 #define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
93         { \
94                 u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \
95                 u[1] =  ((      0.5f * t - 1.0f) * t       ) * t + (2.0f/3.0f); \
96                 u[2] =  ((     -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \
97                 u[3] = (1.0f / 6.0f) * t * t * t; \
98         } (void)0
99
100         ccl_always_inline float4 read(float4 r)
101         {
102                 return r;
103         }
104
105         ccl_always_inline float4 read(uchar4 r)
106         {
107                 float f = 1.0f/255.0f;
108                 return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
109         }
110
111         ccl_always_inline int wrap_periodic(int x, int width)
112         {
113                 x %= width;
114                 if(x < 0)
115                         x += width;
116                 return x;
117         }
118
119         ccl_always_inline int wrap_clamp(int x, int width)
120         {
121                 return clamp(x, 0, width-1);
122         }
123
124         ccl_always_inline float frac(float x, int *ix)
125         {
126                 int i = float_to_int(x) - ((x < 0.0f)? 1: 0);
127                 *ix = i;
128                 return x - (float)i;
129         }
130
131         ccl_always_inline float4 interp(float x, float y)
132         {
133                 if(UNLIKELY(!data))
134                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
135
136                 int ix, iy, nix, niy;
137
138                 if(interpolation == INTERPOLATION_CLOSEST) {
139                         frac(x*(float)width, &ix);
140                         frac(y*(float)height, &iy);
141                         switch(extension) {
142                                 case EXTENSION_REPEAT:
143                                         ix = wrap_periodic(ix, width);
144                                         iy = wrap_periodic(iy, height);
145                                         break;
146                                 case EXTENSION_CLIP:
147                                         if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
148                                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
149                                         }
150                                         /* Fall through. */
151                                 case EXTENSION_EXTEND:
152                                         ix = wrap_clamp(ix, width);
153                                         iy = wrap_clamp(iy, height);
154                                         break;
155                         }
156                         return read(data[ix + iy*width]);
157                 }
158                 else if(interpolation == INTERPOLATION_LINEAR) {
159                         float tx = frac(x*(float)width - 0.5f, &ix);
160                         float ty = frac(y*(float)height - 0.5f, &iy);
161
162                         switch(extension) {
163                                 case EXTENSION_REPEAT:
164                                         ix = wrap_periodic(ix, width);
165                                         iy = wrap_periodic(iy, height);
166
167                                         nix = wrap_periodic(ix+1, width);
168                                         niy = wrap_periodic(iy+1, height);
169                                         break;
170                                 case EXTENSION_CLIP:
171                                         if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
172                                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
173                                         }
174                                         /* Fall through. */
175                                 case EXTENSION_EXTEND:
176                                         nix = wrap_clamp(ix+1, width);
177                                         niy = wrap_clamp(iy+1, height);
178
179                                         ix = wrap_clamp(ix, width);
180                                         iy = wrap_clamp(iy, height);
181                                         break;
182                         }
183
184                         float4 r = (1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width]);
185                         r += (1.0f - ty)*tx*read(data[nix + iy*width]);
186                         r += ty*(1.0f - tx)*read(data[ix + niy*width]);
187                         r += ty*tx*read(data[nix + niy*width]);
188
189                         return r;
190                 }
191                 else {
192                         /* Bicubic b-spline interpolation. */
193                         float tx = frac(x*(float)width - 0.5f, &ix);
194                         float ty = frac(y*(float)height - 0.5f, &iy);
195                         int pix, piy, nnix, nniy;
196                         switch(extension) {
197                                 case EXTENSION_REPEAT:
198                                         ix = wrap_periodic(ix, width);
199                                         iy = wrap_periodic(iy, height);
200
201                                         pix = wrap_periodic(ix-1, width);
202                                         piy = wrap_periodic(iy-1, height);
203
204                                         nix = wrap_periodic(ix+1, width);
205                                         niy = wrap_periodic(iy+1, height);
206
207                                         nnix = wrap_periodic(ix+2, width);
208                                         nniy = wrap_periodic(iy+2, height);
209                                         break;
210                                 case EXTENSION_CLIP:
211                                         if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
212                                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
213                                         }
214                                         /* Fall through. */
215                                 case EXTENSION_EXTEND:
216                                         pix = wrap_clamp(ix-1, width);
217                                         piy = wrap_clamp(iy-1, height);
218
219                                         nix = wrap_clamp(ix+1, width);
220                                         niy = wrap_clamp(iy+1, height);
221
222                                         nnix = wrap_clamp(ix+2, width);
223                                         nniy = wrap_clamp(iy+2, height);
224
225                                         ix = wrap_clamp(ix, width);
226                                         iy = wrap_clamp(iy, height);
227                                         break;
228                         }
229
230                         const int xc[4] = {pix, ix, nix, nnix};
231                         const int yc[4] = {width * piy,
232                                            width * iy,
233                                            width * niy,
234                                            width * nniy};
235                         float u[4], v[4];
236                         /* Some helper macro to keep code reasonable size,
237                          * let compiler to inline all the matrix multiplications.
238                          */
239 #define DATA(x, y) (read(data[xc[x] + yc[y]]))
240 #define TERM(col) \
241                         (v[col] * (u[0] * DATA(0, col) + \
242                                    u[1] * DATA(1, col) + \
243                                    u[2] * DATA(2, col) + \
244                                    u[3] * DATA(3, col)))
245
246                         SET_CUBIC_SPLINE_WEIGHTS(u, tx);
247                         SET_CUBIC_SPLINE_WEIGHTS(v, ty);
248
249                         /* Actual interpolation. */
250                         return TERM(0) + TERM(1) + TERM(2) + TERM(3);
251
252 #undef TERM
253 #undef DATA
254                 }
255         }
256
257         ccl_always_inline float4 interp_3d(float x, float y, float z)
258         {
259                 return interp_3d_ex(x, y, z, interpolation);
260         }
261
262         ccl_always_inline float4 interp_3d_ex(float x, float y, float z,
263                                               int interpolation = INTERPOLATION_LINEAR)
264         {
265                 if(UNLIKELY(!data))
266                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
267
268                 int ix, iy, iz, nix, niy, niz;
269
270                 if(interpolation == INTERPOLATION_CLOSEST) {
271                         frac(x*(float)width, &ix);
272                         frac(y*(float)height, &iy);
273                         frac(z*(float)depth, &iz);
274
275                         switch(extension) {
276                                 case EXTENSION_REPEAT:
277                                         ix = wrap_periodic(ix, width);
278                                         iy = wrap_periodic(iy, height);
279                                         iz = wrap_periodic(iz, depth);
280                                         break;
281                                 case EXTENSION_CLIP:
282                                         if(x < 0.0f || y < 0.0f || z < 0.0f ||
283                                            x > 1.0f || y > 1.0f || z > 1.0f)
284                                         {
285                                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
286                                         }
287                                         /* Fall through. */
288                                 case EXTENSION_EXTEND:
289                                         ix = wrap_clamp(ix, width);
290                                         iy = wrap_clamp(iy, height);
291                                         iz = wrap_clamp(iz, depth);
292                                         break;
293                         }
294
295                         return read(data[ix + iy*width + iz*width*height]);
296                 }
297                 else if(interpolation == INTERPOLATION_LINEAR) {
298                         float tx = frac(x*(float)width - 0.5f, &ix);
299                         float ty = frac(y*(float)height - 0.5f, &iy);
300                         float tz = frac(z*(float)depth - 0.5f, &iz);
301
302                         switch(extension) {
303                                 case EXTENSION_REPEAT:
304                                         ix = wrap_periodic(ix, width);
305                                         iy = wrap_periodic(iy, height);
306                                         iz = wrap_periodic(iz, depth);
307
308                                         nix = wrap_periodic(ix+1, width);
309                                         niy = wrap_periodic(iy+1, height);
310                                         niz = wrap_periodic(iz+1, depth);
311                                         break;
312                                 case EXTENSION_CLIP:
313                                         if(x < 0.0f || y < 0.0f || z < 0.0f ||
314                                            x > 1.0f || y > 1.0f || z > 1.0f)
315                                         {
316                                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
317                                         }
318                                         /* Fall through. */
319                                 case EXTENSION_EXTEND:
320                                         nix = wrap_clamp(ix+1, width);
321                                         niy = wrap_clamp(iy+1, height);
322                                         niz = wrap_clamp(iz+1, depth);
323
324                                         ix = wrap_clamp(ix, width);
325                                         iy = wrap_clamp(iy, height);
326                                         iz = wrap_clamp(iz, depth);
327                                         break;
328                         }
329
330                         float4 r;
331
332                         r  = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + iz*width*height]);
333                         r += (1.0f - tz)*(1.0f - ty)*tx*read(data[nix + iy*width + iz*width*height]);
334                         r += (1.0f - tz)*ty*(1.0f - tx)*read(data[ix + niy*width + iz*width*height]);
335                         r += (1.0f - tz)*ty*tx*read(data[nix + niy*width + iz*width*height]);
336
337                         r += tz*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + niz*width*height]);
338                         r += tz*(1.0f - ty)*tx*read(data[nix + iy*width + niz*width*height]);
339                         r += tz*ty*(1.0f - tx)*read(data[ix + niy*width + niz*width*height]);
340                         r += tz*ty*tx*read(data[nix + niy*width + niz*width*height]);
341
342                         return r;
343                 }
344                 else {
345                         /* Tricubic b-spline interpolation. */
346                         const float tx = frac(x*(float)width - 0.5f, &ix);
347                         const float ty = frac(y*(float)height - 0.5f, &iy);
348                         const float tz = frac(z*(float)depth - 0.5f, &iz);
349                         int pix, piy, piz, nnix, nniy, nniz;
350
351                         switch(extension) {
352                                 case EXTENSION_REPEAT:
353                                         ix = wrap_periodic(ix, width);
354                                         iy = wrap_periodic(iy, height);
355                                         iz = wrap_periodic(iz, depth);
356
357                                         pix = wrap_periodic(ix-1, width);
358                                         piy = wrap_periodic(iy-1, height);
359                                         piz = wrap_periodic(iz-1, depth);
360
361                                         nix = wrap_periodic(ix+1, width);
362                                         niy = wrap_periodic(iy+1, height);
363                                         niz = wrap_periodic(iz+1, depth);
364
365                                         nnix = wrap_periodic(ix+2, width);
366                                         nniy = wrap_periodic(iy+2, height);
367                                         nniz = wrap_periodic(iz+2, depth);
368                                         break;
369                                 case EXTENSION_CLIP:
370                                         if(x < 0.0f || y < 0.0f || z < 0.0f ||
371                                            x > 1.0f || y > 1.0f || z > 1.0f)
372                                         {
373                                                 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
374                                         }
375                                         /* Fall through. */
376                                 case EXTENSION_EXTEND:
377                                         pix = wrap_clamp(ix-1, width);
378                                         piy = wrap_clamp(iy-1, height);
379                                         piz = wrap_clamp(iz-1, depth);
380
381                                         nix = wrap_clamp(ix+1, width);
382                                         niy = wrap_clamp(iy+1, height);
383                                         niz = wrap_clamp(iz+1, depth);
384
385                                         nnix = wrap_clamp(ix+2, width);
386                                         nniy = wrap_clamp(iy+2, height);
387                                         nniz = wrap_clamp(iz+2, depth);
388
389                                         ix = wrap_clamp(ix, width);
390                                         iy = wrap_clamp(iy, height);
391                                         iz = wrap_clamp(iz, depth);
392                                         break;
393                         }
394
395                         const int xc[4] = {pix, ix, nix, nnix};
396                         const int yc[4] = {width * piy,
397                                            width * iy,
398                                            width * niy,
399                                            width * nniy};
400                         const int zc[4] = {width * height * piz,
401                                            width * height * iz,
402                                            width * height * niz,
403                                            width * height * nniz};
404                         float u[4], v[4], w[4];
405
406                         /* Some helper macro to keep code reasonable size,
407                          * let compiler to inline all the matrix multiplications.
408                          */
409 #define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]]))
410 #define COL_TERM(col, row) \
411                         (v[col] * (u[0] * DATA(0, col, row) + \
412                                    u[1] * DATA(1, col, row) + \
413                                    u[2] * DATA(2, col, row) + \
414                                    u[3] * DATA(3, col, row)))
415 #define ROW_TERM(row) \
416                         (w[row] * (COL_TERM(0, row) + \
417                                    COL_TERM(1, row) + \
418                                    COL_TERM(2, row) + \
419                                    COL_TERM(3, row)))
420
421                         SET_CUBIC_SPLINE_WEIGHTS(u, tx);
422                         SET_CUBIC_SPLINE_WEIGHTS(v, ty);
423                         SET_CUBIC_SPLINE_WEIGHTS(w, tz);
424
425                         /* Actual interpolation. */
426                         return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
427
428 #undef COL_TERM
429 #undef ROW_TERM
430 #undef DATA
431                 }
432         }
433
434         ccl_always_inline void dimensions_set(int width_, int height_, int depth_)
435         {
436                 width = width_;
437                 height = height_;
438                 depth = depth_;
439         }
440
441         T *data;
442         int interpolation;
443         ExtensionType extension;
444         int width, height, depth;
445 #undef SET_CUBIC_SPLINE_WEIGHTS
446 };
447
448 typedef texture<float4> texture_float4;
449 typedef texture<float2> texture_float2;
450 typedef texture<float> texture_float;
451 typedef texture<uint> texture_uint;
452 typedef texture<int> texture_int;
453 typedef texture<uint4> texture_uint4;
454 typedef texture<uchar4> texture_uchar4;
455 typedef texture_image<float4> texture_image_float4;
456 typedef texture_image<uchar4> texture_image_uchar4;
457
/* Macros to handle different memory storage on different devices.
 *
 * On the CPU every access goes through a pointer named `kg`, which must be
 * in scope wherever these macros are used. */

/* Flat array textures: `tex` is the name of a member of *kg and is pasted
 * verbatim, so it cannot be an arbitrary expression. */
#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
#define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index))
#define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index))
/* NOTE(review): forwards to a lookup() member; confirm that the type of
 * kg->tex actually provides one before using this macro. */
#define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size))

/* Image textures: `tex` is an integer slot. Slots below MAX_FLOAT_IMAGES
 * select kg->texture_float_images[tex]; the remaining slots select
 * kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].
 *
 * `tex` is parenthesized so that expression arguments keep their meaning,
 * but it is still evaluated more than once: pass an expression without side
 * effects. */
#define kernel_tex_image_interp(tex, x, y) \
	(((tex) < MAX_FLOAT_IMAGES) ? kg->texture_float_images[(tex)].interp(x, y) \
	                            : kg->texture_byte_images[(tex) - MAX_FLOAT_IMAGES].interp(x, y))
#define kernel_tex_image_interp_3d(tex, x, y, z) \
	(((tex) < MAX_FLOAT_IMAGES) ? kg->texture_float_images[(tex)].interp_3d(x, y, z) \
	                            : kg->texture_byte_images[(tex) - MAX_FLOAT_IMAGES].interp_3d(x, y, z))
#define kernel_tex_image_interp_3d_ex(tex, x, y, z, interpolation) \
	(((tex) < MAX_FLOAT_IMAGES) ? kg->texture_float_images[(tex)].interp_3d_ex(x, y, z, interpolation) \
	                            : kg->texture_byte_images[(tex) - MAX_FLOAT_IMAGES].interp_3d_ex(x, y, z, interpolation))

/* Shorthand for the __data member of *kg. */
#define kernel_data (kg->__data)
469
#ifdef __KERNEL_SSE2__
/* vector3 instantiations over the SSE float, int and bool types. */
typedef vector3<ssef> sse3f;
typedef vector3<ssei> sse3i;
typedef vector3<sseb> sse3b;

/* Printing helpers for the types above. Each one prints the x, y and z
 * members of `a`, in that order, through the matching single-value printer;
 * all three calls receive the same `label`. */

ccl_device_inline void print_sse3f(const char *label, sse3f& a)
{
	print_ssef(label, a.x);
	print_ssef(label, a.y);
	print_ssef(label, a.z);
}

ccl_device_inline void print_sse3i(const char *label, sse3i& a)
{
	print_ssei(label, a.x);
	print_ssei(label, a.y);
	print_ssei(label, a.z);
}

ccl_device_inline void print_sse3b(const char *label, sse3b& a)
{
	print_sseb(label, a.x);
	print_sseb(label, a.y);
	print_sseb(label, a.z);
}

#endif  /* __KERNEL_SSE2__ */
497
498 CCL_NAMESPACE_END
499
500 #endif /* __KERNEL_COMPAT_CPU_H__ */
501