Fix cycles texture crash on win x86-64 + msvc 11
[blender-staging.git] / intern / cycles / kernel / svm / svm_image.h
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License
15  */
16
17 CCL_NAMESPACE_BEGIN
18
19 #ifdef __KERNEL_OPENCL__
20
21 /* For OpenCL all images are packed in a single array, and we do manual lookup
22  * and interpolation. */
23
24 ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int offset)
25 {
26         uchar4 r = kernel_tex_fetch(__tex_image_packed, offset);
27         float f = 1.0f/255.0f;
28         return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
29 }
30
31 ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width)
32 {
33         x %= width;
34         if(x < 0)
35                 x += width;
36         return x;
37 }
38
39 ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width)
40 {
41         return clamp(x, 0, width-1);
42 }
43
44 ccl_device_inline float svm_image_texture_frac(float x, int *ix)
45 {
46         int i = float_to_int(x) - ((x < 0.0f)? 1: 0);
47         *ix = i;
48         return x - (float)i;
49 }
50
51 ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb, uint use_alpha)
52 {
53         /* first slots are used by float textures, which are not supported here */
54         if(id < TEX_NUM_FLOAT_IMAGES)
55                 return make_float4(1.0f, 0.0f, 1.0f, 1.0f);
56
57         id -= TEX_NUM_FLOAT_IMAGES;
58
59         uint4 info = kernel_tex_fetch(__tex_image_packed_info, id);
60         uint width = info.x;
61         uint height = info.y;
62         uint offset = info.z;
63         uint periodic = info.w;
64
65         int ix, iy, nix, niy;
66         float tx = svm_image_texture_frac(x*width, &ix);
67         float ty = svm_image_texture_frac(y*height, &iy);
68
69         if(periodic) {
70                 ix = svm_image_texture_wrap_periodic(ix, width);
71                 iy = svm_image_texture_wrap_periodic(iy, height);
72
73                 nix = svm_image_texture_wrap_periodic(ix+1, width);
74                 niy = svm_image_texture_wrap_periodic(iy+1, height);
75         }
76         else {
77                 ix = svm_image_texture_wrap_clamp(ix, width);
78                 iy = svm_image_texture_wrap_clamp(iy, height);
79
80                 nix = svm_image_texture_wrap_clamp(ix+1, width);
81                 niy = svm_image_texture_wrap_clamp(iy+1, height);
82         }
83
84         float4 r = (1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, offset + ix + iy*width);
85         r += (1.0f - ty)*tx*svm_image_texture_read(kg, offset + nix + iy*width);
86         r += ty*(1.0f - tx)*svm_image_texture_read(kg, offset + ix + niy*width);
87         r += ty*tx*svm_image_texture_read(kg, offset + nix + niy*width);
88
89         if(use_alpha && r.w != 1.0f && r.w != 0.0f) {
90                 float invw = 1.0f/r.w;
91                 r.x *= invw;
92                 r.y *= invw;
93                 r.z *= invw;
94
95                 if(id >= TEX_NUM_FLOAT_IMAGES) {
96                         r.x = min(r.x, 1.0f);
97                         r.y = min(r.y, 1.0f);
98                         r.z = min(r.z, 1.0f);
99                 }
100         }
101
102         if(srgb) {
103                 r.x = color_srgb_to_scene_linear(r.x);
104                 r.y = color_srgb_to_scene_linear(r.y);
105                 r.z = color_srgb_to_scene_linear(r.z);
106         }
107
108         return r;
109 }
110
111 #else
112
113 ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb, uint use_alpha)
114 {
115 #if defined(__KERNEL_CPU__) && defined(__KERNEL_SSE2__)
116         union { float4 rgba; __m128 m128; } r = { kernel_tex_image_interp(id, x, y) };
117 #elif defined(__KERNEL_CPU__)
118         float4 r = kernel_tex_image_interp(id, x, y);
119 #else
120         float4 r;
121
122         /* not particularly proud of this massive switch, what are the
123          * alternatives?
124          * - use a single big 1D texture, and do our own lookup/filtering
125          * - group by size and use a 3d texture, performance impact
126          * - group into larger texture with some padding for correct lerp
127          *
128          * also note that cuda has 128 textures limit, we use 100 now, since
129          * we still need some for other storage */
130
131         switch(id) {
132                 case 0: r = kernel_tex_image_interp(__tex_image_float_000, x, y); break;
133                 case 1: r = kernel_tex_image_interp(__tex_image_float_001, x, y); break;
134                 case 2: r = kernel_tex_image_interp(__tex_image_float_002, x, y); break;
135                 case 3: r = kernel_tex_image_interp(__tex_image_float_003, x, y); break;
136                 case 4: r = kernel_tex_image_interp(__tex_image_float_004, x, y); break;
137                 case 5: r = kernel_tex_image_interp(__tex_image_005, x, y); break;
138                 case 6: r = kernel_tex_image_interp(__tex_image_006, x, y); break;
139                 case 7: r = kernel_tex_image_interp(__tex_image_007, x, y); break;
140                 case 8: r = kernel_tex_image_interp(__tex_image_008, x, y); break;
141                 case 9: r = kernel_tex_image_interp(__tex_image_009, x, y); break;
142                 case 10: r = kernel_tex_image_interp(__tex_image_010, x, y); break;
143                 case 11: r = kernel_tex_image_interp(__tex_image_011, x, y); break;
144                 case 12: r = kernel_tex_image_interp(__tex_image_012, x, y); break;
145                 case 13: r = kernel_tex_image_interp(__tex_image_013, x, y); break;
146                 case 14: r = kernel_tex_image_interp(__tex_image_014, x, y); break;
147                 case 15: r = kernel_tex_image_interp(__tex_image_015, x, y); break;
148                 case 16: r = kernel_tex_image_interp(__tex_image_016, x, y); break;
149                 case 17: r = kernel_tex_image_interp(__tex_image_017, x, y); break;
150                 case 18: r = kernel_tex_image_interp(__tex_image_018, x, y); break;
151                 case 19: r = kernel_tex_image_interp(__tex_image_019, x, y); break;
152                 case 20: r = kernel_tex_image_interp(__tex_image_020, x, y); break;
153                 case 21: r = kernel_tex_image_interp(__tex_image_021, x, y); break;
154                 case 22: r = kernel_tex_image_interp(__tex_image_022, x, y); break;
155                 case 23: r = kernel_tex_image_interp(__tex_image_023, x, y); break;
156                 case 24: r = kernel_tex_image_interp(__tex_image_024, x, y); break;
157                 case 25: r = kernel_tex_image_interp(__tex_image_025, x, y); break;
158                 case 26: r = kernel_tex_image_interp(__tex_image_026, x, y); break;
159                 case 27: r = kernel_tex_image_interp(__tex_image_027, x, y); break;
160                 case 28: r = kernel_tex_image_interp(__tex_image_028, x, y); break;
161                 case 29: r = kernel_tex_image_interp(__tex_image_029, x, y); break;
162                 case 30: r = kernel_tex_image_interp(__tex_image_030, x, y); break;
163                 case 31: r = kernel_tex_image_interp(__tex_image_031, x, y); break;
164                 case 32: r = kernel_tex_image_interp(__tex_image_032, x, y); break;
165                 case 33: r = kernel_tex_image_interp(__tex_image_033, x, y); break;
166                 case 34: r = kernel_tex_image_interp(__tex_image_034, x, y); break;
167                 case 35: r = kernel_tex_image_interp(__tex_image_035, x, y); break;
168                 case 36: r = kernel_tex_image_interp(__tex_image_036, x, y); break;
169                 case 37: r = kernel_tex_image_interp(__tex_image_037, x, y); break;
170                 case 38: r = kernel_tex_image_interp(__tex_image_038, x, y); break;
171                 case 39: r = kernel_tex_image_interp(__tex_image_039, x, y); break;
172                 case 40: r = kernel_tex_image_interp(__tex_image_040, x, y); break;
173                 case 41: r = kernel_tex_image_interp(__tex_image_041, x, y); break;
174                 case 42: r = kernel_tex_image_interp(__tex_image_042, x, y); break;
175                 case 43: r = kernel_tex_image_interp(__tex_image_043, x, y); break;
176                 case 44: r = kernel_tex_image_interp(__tex_image_044, x, y); break;
177                 case 45: r = kernel_tex_image_interp(__tex_image_045, x, y); break;
178                 case 46: r = kernel_tex_image_interp(__tex_image_046, x, y); break;
179                 case 47: r = kernel_tex_image_interp(__tex_image_047, x, y); break;
180                 case 48: r = kernel_tex_image_interp(__tex_image_048, x, y); break;
181                 case 49: r = kernel_tex_image_interp(__tex_image_049, x, y); break;
182                 case 50: r = kernel_tex_image_interp(__tex_image_050, x, y); break;
183                 case 51: r = kernel_tex_image_interp(__tex_image_051, x, y); break;
184                 case 52: r = kernel_tex_image_interp(__tex_image_052, x, y); break;
185                 case 53: r = kernel_tex_image_interp(__tex_image_053, x, y); break;
186                 case 54: r = kernel_tex_image_interp(__tex_image_054, x, y); break;
187                 case 55: r = kernel_tex_image_interp(__tex_image_055, x, y); break;
188                 case 56: r = kernel_tex_image_interp(__tex_image_056, x, y); break;
189                 case 57: r = kernel_tex_image_interp(__tex_image_057, x, y); break;
190                 case 58: r = kernel_tex_image_interp(__tex_image_058, x, y); break;
191                 case 59: r = kernel_tex_image_interp(__tex_image_059, x, y); break;
192                 case 60: r = kernel_tex_image_interp(__tex_image_060, x, y); break;
193                 case 61: r = kernel_tex_image_interp(__tex_image_061, x, y); break;
194                 case 62: r = kernel_tex_image_interp(__tex_image_062, x, y); break;
195                 case 63: r = kernel_tex_image_interp(__tex_image_063, x, y); break;
196                 case 64: r = kernel_tex_image_interp(__tex_image_064, x, y); break;
197                 case 65: r = kernel_tex_image_interp(__tex_image_065, x, y); break;
198                 case 66: r = kernel_tex_image_interp(__tex_image_066, x, y); break;
199                 case 67: r = kernel_tex_image_interp(__tex_image_067, x, y); break;
200                 case 68: r = kernel_tex_image_interp(__tex_image_068, x, y); break;
201                 case 69: r = kernel_tex_image_interp(__tex_image_069, x, y); break;
202                 case 70: r = kernel_tex_image_interp(__tex_image_070, x, y); break;
203                 case 71: r = kernel_tex_image_interp(__tex_image_071, x, y); break;
204                 case 72: r = kernel_tex_image_interp(__tex_image_072, x, y); break;
205                 case 73: r = kernel_tex_image_interp(__tex_image_073, x, y); break;
206                 case 74: r = kernel_tex_image_interp(__tex_image_074, x, y); break;
207                 case 75: r = kernel_tex_image_interp(__tex_image_075, x, y); break;
208                 case 76: r = kernel_tex_image_interp(__tex_image_076, x, y); break;
209                 case 77: r = kernel_tex_image_interp(__tex_image_077, x, y); break;
210                 case 78: r = kernel_tex_image_interp(__tex_image_078, x, y); break;
211                 case 79: r = kernel_tex_image_interp(__tex_image_079, x, y); break;
212                 case 80: r = kernel_tex_image_interp(__tex_image_080, x, y); break;
213                 case 81: r = kernel_tex_image_interp(__tex_image_081, x, y); break;
214                 case 82: r = kernel_tex_image_interp(__tex_image_082, x, y); break;
215                 case 83: r = kernel_tex_image_interp(__tex_image_083, x, y); break;
216                 case 84: r = kernel_tex_image_interp(__tex_image_084, x, y); break;
217                 case 85: r = kernel_tex_image_interp(__tex_image_085, x, y); break;
218                 case 86: r = kernel_tex_image_interp(__tex_image_086, x, y); break;
219                 case 87: r = kernel_tex_image_interp(__tex_image_087, x, y); break;
220                 case 88: r = kernel_tex_image_interp(__tex_image_088, x, y); break;
221                 case 89: r = kernel_tex_image_interp(__tex_image_089, x, y); break;
222                 case 90: r = kernel_tex_image_interp(__tex_image_090, x, y); break;
223                 case 91: r = kernel_tex_image_interp(__tex_image_091, x, y); break;
224                 case 92: r = kernel_tex_image_interp(__tex_image_092, x, y); break;
225                 case 93: r = kernel_tex_image_interp(__tex_image_093, x, y); break;
226                 case 94: r = kernel_tex_image_interp(__tex_image_094, x, y); break;
227                 case 95: r = kernel_tex_image_interp(__tex_image_095, x, y); break;
228                 case 96: r = kernel_tex_image_interp(__tex_image_096, x, y); break;
229                 case 97: r = kernel_tex_image_interp(__tex_image_097, x, y); break;
230                 case 98: r = kernel_tex_image_interp(__tex_image_098, x, y); break;
231                 case 99: r = kernel_tex_image_interp(__tex_image_099, x, y); break;
232                 default: 
233                         kernel_assert(0);
234                         return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
235         }
236 #endif
237
238 #ifdef __KERNEL_SSE2__
239         if(use_alpha && r.rgba.w != 1.0f && r.rgba.w != 0.0f) {
240                 float alpha = r.rgba.w;
241                 r.m128 = _mm_div_ps(r.m128, _mm_set1_ps(alpha));
242                 if(id >= TEX_NUM_FLOAT_IMAGES)
243                         r.m128 = _mm_min_ps(r.m128, _mm_set1_ps(1.0f));
244                 r.rgba.w = alpha;
245         }
246
247         if(srgb) {
248                 float alpha = r.rgba.w;
249                 r.m128 = color_srgb_to_scene_linear(r.m128);
250                 r.rgba.w = alpha;
251         }
252
253         return r.rgba;
254 #else
255         if(use_alpha && r.w != 1.0f && r.w != 0.0f) {
256                 float invw = 1.0f/r.w;
257                 r.x *= invw;
258                 r.y *= invw;
259                 r.z *= invw;
260
261                 if(id >= TEX_NUM_FLOAT_IMAGES) {
262                         r.x = min(r.x, 1.0f);
263                         r.y = min(r.y, 1.0f);
264                         r.z = min(r.z, 1.0f);
265                 }
266         }
267
268         if(srgb) {
269                 r.x = color_srgb_to_scene_linear(r.x);
270                 r.y = color_srgb_to_scene_linear(r.y);
271                 r.z = color_srgb_to_scene_linear(r.z);
272         }
273
274         return r;
275 #endif
276 }
277
278 #endif
279
280 ccl_device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
281 {
282         uint id = node.y;
283         uint co_offset, out_offset, alpha_offset, srgb;
284
285         decode_node_uchar4(node.z, &co_offset, &out_offset, &alpha_offset, &srgb);
286
287         float3 co = stack_load_float3(stack, co_offset);
288         uint use_alpha = stack_valid(alpha_offset);
289         float4 f = svm_image_texture(kg, id, co.x, co.y, srgb, use_alpha);
290
291         if(stack_valid(out_offset))
292                 stack_store_float3(stack, out_offset, make_float3(f.x, f.y, f.z));
293         if(stack_valid(alpha_offset))
294                 stack_store_float(stack, alpha_offset, f.w);
295 }
296
297 ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
298 {
299         /* get object space normal */
300         float3 N = sd->N;
301
302         N = sd->N;
303         if(sd->object != ~0)
304                 object_inverse_normal_transform(kg, sd, &N);
305
306         /* project from direction vector to barycentric coordinates in triangles */
307         N.x = fabsf(N.x);
308         N.y = fabsf(N.y);
309         N.z = fabsf(N.z);
310
311         N /= (N.x + N.y + N.z);
312
313         /* basic idea is to think of this as a triangle, each corner representing
314          * one of the 3 faces of the cube. in the corners we have single textures,
315          * in between we blend between two textures, and in the middle we a blend
316          * between three textures.
317          *
318          * the Nxyz values are the barycentric coordinates in an equilateral
319          * triangle, which in case of blending, in the middle has a smaller
320          * equilateral triangle where 3 textures blend. this divides things into
321          * 7 zones, with an if() test for each zone */
322
323         float3 weight = make_float3(0.0f, 0.0f, 0.0f);
324         float blend = __int_as_float(node.w);
325         float limit = 0.5f*(1.0f + blend);
326
327         /* first test for corners with single texture */
328         if(N.x > limit*(N.x + N.y) && N.x > limit*(N.x + N.z)) {
329                 weight.x = 1.0f;
330         }
331         else if(N.y > limit*(N.x + N.y) && N.y > limit*(N.y + N.z)) {
332                 weight.y = 1.0f;
333         }
334         else if(N.z > limit*(N.x + N.z) && N.z > limit*(N.y + N.z)) {
335                 weight.z = 1.0f;
336         }
337         else if(blend > 0.0f) {
338                 /* in case of blending, test for mixes between two textures */
339                 if(N.z < (1.0f - limit)*(N.y + N.x)) {
340                         weight.x = N.x/(N.x + N.y);
341                         weight.x = clamp((weight.x - 0.5f*(1.0f - blend))/blend, 0.0f, 1.0f);
342                         weight.y = 1.0f - weight.x;
343                 }
344                 else if(N.x < (1.0f - limit)*(N.y + N.z)) {
345                         weight.y = N.y/(N.y + N.z);
346                         weight.y = clamp((weight.y - 0.5f*(1.0f - blend))/blend, 0.0f, 1.0f);
347                         weight.z = 1.0f - weight.y;
348                 }
349                 else if(N.y < (1.0f - limit)*(N.x + N.z)) {
350                         weight.x = N.x/(N.x + N.z);
351                         weight.x = clamp((weight.x - 0.5f*(1.0f - blend))/blend, 0.0f, 1.0f);
352                         weight.z = 1.0f - weight.x;
353                 }
354                 else {
355                         /* last case, we have a mix between three */
356                         weight.x = ((2.0f - limit)*N.x + (limit - 1.0f))/(2.0f*limit - 1.0f);
357                         weight.y = ((2.0f - limit)*N.y + (limit - 1.0f))/(2.0f*limit - 1.0f);
358                         weight.z = ((2.0f - limit)*N.z + (limit - 1.0f))/(2.0f*limit - 1.0f);
359                 }
360         }
361
362         /* now fetch textures */
363         uint co_offset, out_offset, alpha_offset, srgb;
364         decode_node_uchar4(node.z, &co_offset, &out_offset, &alpha_offset, &srgb);
365
366         float3 co = stack_load_float3(stack, co_offset);
367         uint id = node.y;
368
369         float4 f = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
370         uint use_alpha = stack_valid(alpha_offset);
371
372         if(weight.x > 0.0f)
373                 f += weight.x*svm_image_texture(kg, id, co.y, co.z, srgb, use_alpha);
374         if(weight.y > 0.0f)
375                 f += weight.y*svm_image_texture(kg, id, co.x, co.z, srgb, use_alpha);
376         if(weight.z > 0.0f)
377                 f += weight.z*svm_image_texture(kg, id, co.y, co.x, srgb, use_alpha);
378
379         if(stack_valid(out_offset))
380                 stack_store_float3(stack, out_offset, make_float3(f.x, f.y, f.z));
381         if(stack_valid(alpha_offset))
382                 stack_store_float(stack, alpha_offset, f.w);
383 }
384
385
386 ccl_device void svm_node_tex_environment(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
387 {
388         uint id = node.y;
389         uint co_offset, out_offset, alpha_offset, srgb;
390         uint projection = node.w;
391
392         decode_node_uchar4(node.z, &co_offset, &out_offset, &alpha_offset, &srgb);
393
394         float3 co = stack_load_float3(stack, co_offset);
395         float2 uv;
396
397         co = normalize(co);
398         
399         if(projection == 0)
400                 uv = direction_to_equirectangular(co);
401         else
402                 uv = direction_to_mirrorball(co);
403
404         uint use_alpha = stack_valid(alpha_offset);
405         float4 f = svm_image_texture(kg, id, uv.x, uv.y, srgb, use_alpha);
406
407         if(stack_valid(out_offset))
408                 stack_store_float3(stack, out_offset, make_float3(f.x, f.y, f.z));
409         if(stack_valid(alpha_offset))
410                 stack_store_float(stack, alpha_offset, f.w);
411 }
412
413 CCL_NAMESPACE_END
414