Cycles: Initial support of 3D textures for CUDA rendering
[blender-staging.git] / intern/cycles/kernel/svm/svm_image.h
index 662419418e3f10399189bb59eb0658abc3552447..86d3262795fd8199802adc88fa4db201262a3105 100644
@@ -1,19 +1,17 @@
 /*
- * Copyright 2011, Blender Foundation.
+ * Copyright 2011-2013 Blender Foundation
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
  *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -23,14 +21,14 @@ CCL_NAMESPACE_BEGIN
 /* For OpenCL all images are packed in a single array, and we do manual lookup
  * and interpolation. */
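
For orientation, a hedged sketch of the layout the packed lookups below assume: the kernel only reads two flat arrays, __tex_image_packed (all byte texels back to back) and __tex_image_packed_info (one uint4 record per image), and the flags word is decoded further down as "periodic = info.w & 0x1; interpolation = info.w >> 1". The helper name here is hypothetical and only mirrors that decode; it is not the actual host-side API.

/* Illustrative sketch only, not part of the patch. */
ccl_device_inline uint svm_image_pack_info_w(uint interpolation, uint periodic)
{
	/* bit 0: periodic wrap flag, remaining bits: interpolation mode,
	 * matching the decode in svm_image_texture() below */
	return (interpolation << 1) | (periodic & 0x1);
}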
 
-__device_inline float4 svm_image_texture_read(KernelGlobals *kg, int offset)
+ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int offset)
 {
        uchar4 r = kernel_tex_fetch(__tex_image_packed, offset);
        float f = 1.0f/255.0f;
        return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
 }
 
-__device_inline int svm_image_texture_wrap_periodic(int x, int width)
+ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width)
 {
        x %= width;
        if(x < 0)
@@ -38,49 +36,88 @@ __device_inline int svm_image_texture_wrap_periodic(int x, int width)
        return x;
 }
 
-__device_inline int svm_image_texture_wrap_clamp(int x, int width)
+ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width)
 {
        return clamp(x, 0, width-1);
 }
 
-__device_inline float svm_image_texture_frac(float x, int *ix)
+ccl_device_inline float svm_image_texture_frac(float x, int *ix)
 {
-       int i = (int)x - ((x < 0.0f)? 1: 0);
+       int i = float_to_int(x) - ((x < 0.0f)? 1: 0);
        *ix = i;
        return x - (float)i;
 }
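
Two worked values for svm_image_texture_frac above, to make the floor-style split that feeds the bilinear weights explicit:

/* Illustrative examples:
 *   svm_image_texture_frac( 3.70f, &ix)  ->  ix =  3, returns 0.70f
 *   svm_image_texture_frac(-0.25f, &ix)  ->  ix = -1, returns 0.75f
 * The integer part is the texel index, the return value is the fractional
 * remainder used as the interpolation weight. */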
 
-__device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb)
+ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb, uint use_alpha)
 {
+       /* first slots are used by float textures, which are not supported here */
+       if(id < TEX_NUM_FLOAT_IMAGES)
+               return make_float4(1.0f, 0.0f, 1.0f, 1.0f);
+
+       id -= TEX_NUM_FLOAT_IMAGES;
+
        uint4 info = kernel_tex_fetch(__tex_image_packed_info, id);
        uint width = info.x;
        uint height = info.y;
        uint offset = info.z;
-       uint periodic = info.w;
+       uint periodic = (info.w & 0x1);
+       uint interpolation = info.w >> 1;
 
+       float4 r;
        int ix, iy, nix, niy;
-       float tx = svm_image_texture_frac(x*width, &ix);
-       float ty = svm_image_texture_frac(y*height, &iy);
+       if(interpolation == INTERPOLATION_CLOSEST) {
+               svm_image_texture_frac(x*width, &ix);
+               svm_image_texture_frac(y*height, &iy);
 
-       if(periodic) {
-               ix = svm_image_texture_wrap_periodic(ix, width);
-               iy = svm_image_texture_wrap_periodic(iy, height);
+               if(periodic) {
+                       ix = svm_image_texture_wrap_periodic(ix, width);
+                       iy = svm_image_texture_wrap_periodic(iy, height);
+               }
+               else {
+                       ix = svm_image_texture_wrap_clamp(ix, width);
+                       iy = svm_image_texture_wrap_clamp(iy, height);
 
-               nix = svm_image_texture_wrap_periodic(ix+1, width);
-               niy = svm_image_texture_wrap_periodic(iy+1, height);
+               }
+               r = svm_image_texture_read(kg, offset + ix + iy*width);
        }
-       else {
-               ix = svm_image_texture_wrap_clamp(ix, width);
-               iy = svm_image_texture_wrap_clamp(iy, height);
+       else { /* We default to linear interpolation if it is not closest */
+               float tx = svm_image_texture_frac(x*width, &ix);
+               float ty = svm_image_texture_frac(y*height, &iy);
+
+               if(periodic) {
+                       ix = svm_image_texture_wrap_periodic(ix, width);
+                       iy = svm_image_texture_wrap_periodic(iy, height);
+
+                       nix = svm_image_texture_wrap_periodic(ix+1, width);
+                       niy = svm_image_texture_wrap_periodic(iy+1, height);
+               }
+               else {
+                       ix = svm_image_texture_wrap_clamp(ix, width);
+                       iy = svm_image_texture_wrap_clamp(iy, height);
+
+                       nix = svm_image_texture_wrap_clamp(ix+1, width);
+                       niy = svm_image_texture_wrap_clamp(iy+1, height);
+               }
+
 
-               nix = svm_image_texture_wrap_clamp(ix+1, width);
-               niy = svm_image_texture_wrap_clamp(iy+1, height);
+               r = (1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, offset + ix + iy*width);
+               r += (1.0f - ty)*tx*svm_image_texture_read(kg, offset + nix + iy*width);
+               r += ty*(1.0f - tx)*svm_image_texture_read(kg, offset + ix + niy*width);
+               r += ty*tx*svm_image_texture_read(kg, offset + nix + niy*width);
        }
 
-       float4 r = (1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, offset + ix + iy*width);
-       r += (1.0f - ty)*tx*svm_image_texture_read(kg, offset + nix + iy*width);
-       r += ty*(1.0f - tx)*svm_image_texture_read(kg, offset + ix + niy*width);
-       r += ty*tx*svm_image_texture_read(kg, offset + nix + niy*width);
+       if(use_alpha && r.w != 1.0f && r.w != 0.0f) {
+               float invw = 1.0f/r.w;
+               r.x *= invw;
+               r.y *= invw;
+               r.z *= invw;
+
+               if(id >= TEX_NUM_FLOAT_IMAGES) {
+                       r.x = min(r.x, 1.0f);
+                       r.y = min(r.y, 1.0f);
+                       r.z = min(r.z, 1.0f);
+               }
+       }
 
        if(srgb) {
                r.x = color_srgb_to_scene_linear(r.x);
@@ -93,21 +130,27 @@ __device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, u
 
 #else
 
-__device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb)
+ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb, uint use_alpha)
 {
-       float4 r;
-
 #ifdef __KERNEL_CPU__
+#ifdef __KERNEL_SSE2__
+       ssef r_ssef;
+       float4 &r = (float4 &)r_ssef;
        r = kernel_tex_image_interp(id, x, y);
 #else
+       float4 r = kernel_tex_image_interp(id, x, y);
+#endif
+#else
+       float4 r;
+
        /* not particularly proud of this massive switch, what are the
         * alternatives?
         * - use a single big 1D texture, and do our own lookup/filtering
         * - group by size and use a 3d texture, performance impact
         * - group into larger texture with some padding for correct lerp
         *
-        * also note that cuda has 128 textures limit, we use 100 now, since
-        * we still need some for other storage */
+        * also note that cuda has a texture limit (128 for Fermi, 256 for Kepler),
+        * and we cannot use them all since we still need some for other storage */
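
An aside on the comment above, not part of the patch: the repeated cases could at least be generated mechanically with a two-argument macro (two arguments because the texture suffixes are zero-padded while the case labels are not), along these hypothetical lines:

/* Hypothetical sketch only. */
#define SVM_TEX_IMAGE_CASE(num, suffix) \
	case num: r = kernel_tex_image_interp(__tex_image_##suffix, x, y); break;

/* e.g. SVM_TEX_IMAGE_CASE(93, 093) expands to the "case 93" line below. */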
 
        switch(id) {
                case 0: r = kernel_tex_image_interp(__tex_image_float_000, x, y); break;
@@ -203,6 +246,8 @@ __device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, u
                case 90: r = kernel_tex_image_interp(__tex_image_090, x, y); break;
                case 91: r = kernel_tex_image_interp(__tex_image_091, x, y); break;
                case 92: r = kernel_tex_image_interp(__tex_image_092, x, y); break;
+
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
                case 93: r = kernel_tex_image_interp(__tex_image_093, x, y); break;
                case 94: r = kernel_tex_image_interp(__tex_image_094, x, y); break;
                case 95: r = kernel_tex_image_interp(__tex_image_095, x, y); break;
@@ -210,24 +255,112 @@ __device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, u
                case 97: r = kernel_tex_image_interp(__tex_image_097, x, y); break;
                case 98: r = kernel_tex_image_interp(__tex_image_098, x, y); break;
                case 99: r = kernel_tex_image_interp(__tex_image_099, x, y); break;
-               default: 
+               case 100: r = kernel_tex_image_interp(__tex_image_100, x, y); break;
+               case 101: r = kernel_tex_image_interp(__tex_image_101, x, y); break;
+               case 102: r = kernel_tex_image_interp(__tex_image_102, x, y); break;
+               case 103: r = kernel_tex_image_interp(__tex_image_103, x, y); break;
+               case 104: r = kernel_tex_image_interp(__tex_image_104, x, y); break;
+               case 105: r = kernel_tex_image_interp(__tex_image_105, x, y); break;
+               case 106: r = kernel_tex_image_interp(__tex_image_106, x, y); break;
+               case 107: r = kernel_tex_image_interp(__tex_image_107, x, y); break;
+               case 108: r = kernel_tex_image_interp(__tex_image_108, x, y); break;
+               case 109: r = kernel_tex_image_interp(__tex_image_109, x, y); break;
+               case 110: r = kernel_tex_image_interp(__tex_image_110, x, y); break;
+               case 111: r = kernel_tex_image_interp(__tex_image_111, x, y); break;
+               case 112: r = kernel_tex_image_interp(__tex_image_112, x, y); break;
+               case 113: r = kernel_tex_image_interp(__tex_image_113, x, y); break;
+               case 114: r = kernel_tex_image_interp(__tex_image_114, x, y); break;
+               case 115: r = kernel_tex_image_interp(__tex_image_115, x, y); break;
+               case 116: r = kernel_tex_image_interp(__tex_image_116, x, y); break;
+               case 117: r = kernel_tex_image_interp(__tex_image_117, x, y); break;
+               case 118: r = kernel_tex_image_interp(__tex_image_118, x, y); break;
+               case 119: r = kernel_tex_image_interp(__tex_image_119, x, y); break;
+               case 120: r = kernel_tex_image_interp(__tex_image_120, x, y); break;
+               case 121: r = kernel_tex_image_interp(__tex_image_121, x, y); break;
+               case 122: r = kernel_tex_image_interp(__tex_image_122, x, y); break;
+               case 123: r = kernel_tex_image_interp(__tex_image_123, x, y); break;
+               case 124: r = kernel_tex_image_interp(__tex_image_124, x, y); break;
+               case 125: r = kernel_tex_image_interp(__tex_image_125, x, y); break;
+               case 126: r = kernel_tex_image_interp(__tex_image_126, x, y); break;
+               case 127: r = kernel_tex_image_interp(__tex_image_127, x, y); break;
+               case 128: r = kernel_tex_image_interp(__tex_image_128, x, y); break;
+               case 129: r = kernel_tex_image_interp(__tex_image_129, x, y); break;
+               case 130: r = kernel_tex_image_interp(__tex_image_130, x, y); break;
+               case 131: r = kernel_tex_image_interp(__tex_image_131, x, y); break;
+               case 132: r = kernel_tex_image_interp(__tex_image_132, x, y); break;
+               case 133: r = kernel_tex_image_interp(__tex_image_133, x, y); break;
+               case 134: r = kernel_tex_image_interp(__tex_image_134, x, y); break;
+               case 135: r = kernel_tex_image_interp(__tex_image_135, x, y); break;
+               case 136: r = kernel_tex_image_interp(__tex_image_136, x, y); break;
+               case 137: r = kernel_tex_image_interp(__tex_image_137, x, y); break;
+               case 138: r = kernel_tex_image_interp(__tex_image_138, x, y); break;
+               case 139: r = kernel_tex_image_interp(__tex_image_139, x, y); break;
+               case 140: r = kernel_tex_image_interp(__tex_image_140, x, y); break;
+               case 141: r = kernel_tex_image_interp(__tex_image_141, x, y); break;
+               case 142: r = kernel_tex_image_interp(__tex_image_142, x, y); break;
+               case 143: r = kernel_tex_image_interp(__tex_image_143, x, y); break;
+               case 144: r = kernel_tex_image_interp(__tex_image_144, x, y); break;
+               case 145: r = kernel_tex_image_interp(__tex_image_145, x, y); break;
+               case 146: r = kernel_tex_image_interp(__tex_image_146, x, y); break;
+               case 147: r = kernel_tex_image_interp(__tex_image_147, x, y); break;
+               case 148: r = kernel_tex_image_interp(__tex_image_148, x, y); break;
+               case 149: r = kernel_tex_image_interp(__tex_image_149, x, y); break;
+               case 150: r = kernel_tex_image_interp(__tex_image_150, x, y); break;
+#endif
+
+               default:
                        kernel_assert(0);
                        return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
        }
 #endif
 
+#ifdef __KERNEL_SSE2__
+       float alpha = r.w;
+
+       if(use_alpha && alpha != 1.0f && alpha != 0.0f) {
+               r_ssef = r_ssef / ssef(alpha);
+               if(id >= TEX_NUM_FLOAT_IMAGES)
+                       r_ssef = min(r_ssef, ssef(1.0f));
+               r.w = alpha;
+       }
+
+       if(srgb) {
+               r_ssef = color_srgb_to_scene_linear(r_ssef);
+               r.w = alpha;
+       }
+#else
+       if(use_alpha && r.w != 1.0f && r.w != 0.0f) {
+               float invw = 1.0f/r.w;
+               r.x *= invw;
+               r.y *= invw;
+               r.z *= invw;
+
+               if(id >= TEX_NUM_FLOAT_IMAGES) {
+                       r.x = min(r.x, 1.0f);
+                       r.y = min(r.y, 1.0f);
+                       r.z = min(r.z, 1.0f);
+               }
+       }
+
        if(srgb) {
                r.x = color_srgb_to_scene_linear(r.x);
                r.y = color_srgb_to_scene_linear(r.y);
                r.z = color_srgb_to_scene_linear(r.z);
        }
+#endif
 
        return r;
 }
 
 #endif
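
Both code paths of svm_image_texture apply the same use_alpha handling; as a worked example of what the divide by r.w does (un-premultiplying associated alpha into a straight-alpha color, with byte images additionally clamped to 1.0):

/* Illustrative values only:
 *   stored texel:  (0.2f, 0.1f, 0.0f, 0.5f)   associated (premultiplied) alpha
 *   after divide:  (0.4f, 0.2f, 0.0f, 0.5f)   straight alpha, w preserved
 * Texels with w == 1.0f or w == 0.0f are left untouched. */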
 
-__device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+/* Remap coordinate from 0..1 box to -1..1 */
+ccl_device_inline float3 texco_remap_square(float3 co)
+{
+       return (co - make_float3(0.5f, 0.5f, 0.5f)) * 2.0f;
+}
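
For reference, texco_remap_square is a plain affine remap of the unit cube:

/* (0, 0, 0) -> (-1, -1, -1),  (0.5, 0.5, 0.5) -> (0, 0, 0),  (1, 1, 1) -> (1, 1, 1) */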
+
+ccl_device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {
        uint id = node.y;
        uint co_offset, out_offset, alpha_offset, srgb;
@@ -235,7 +368,20 @@ __device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *stack
        decode_node_uchar4(node.z, &co_offset, &out_offset, &alpha_offset, &srgb);
 
        float3 co = stack_load_float3(stack, co_offset);
-       float4 f = svm_image_texture(kg, id, co.x, co.y, srgb);
+       float2 tex_co;
+       uint use_alpha = stack_valid(alpha_offset);
+       if(node.w == NODE_IMAGE_PROJ_SPHERE) {
+               co = texco_remap_square(co);
+               tex_co = map_to_sphere(co);
+       }
+       else if(node.w == NODE_IMAGE_PROJ_TUBE) {
+               co = texco_remap_square(co);
+               tex_co = map_to_tube(co);
+       }
+       else {
+               tex_co = make_float2(co.x, co.y);
+       }
+       float4 f = svm_image_texture(kg, id, tex_co.x, tex_co.y, srgb, use_alpha);
 
        if(stack_valid(out_offset))
                stack_store_float3(stack, out_offset, make_float3(f.x, f.y, f.z));
@@ -243,13 +389,13 @@ __device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *stack
                stack_store_float(stack, alpha_offset, f.w);
 }
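
The sphere and tube projections above rely on map_to_sphere and map_to_tube, which live elsewhere in the kernel. As rough orientation only, a latitude/longitude style sphere mapping looks like the sketch below; the function name is made up here, and the real Cycles implementations may use different axis and sign conventions.

/* Rough sketch only, not the actual map_to_sphere. */
ccl_device_inline float2 sphere_uv_sketch(float3 co)
{
	float len = sqrtf(co.x*co.x + co.y*co.y + co.z*co.z);

	if(len == 0.0f)
		return make_float2(0.0f, 0.0f);

	float u = 0.5f + atan2f(co.y, co.x) / (2.0f*M_PI_F); /* longitude */
	float v = 1.0f - acosf(co.z/len) / M_PI_F;           /* latitude */

	return make_float2(u, v);
}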
 
-__device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {
        /* get object space normal */
-       float3 N = sd->N;
+       float3 N = ccl_fetch(sd, N);
 
-       N = sd->N;
-       if(sd->object != ~0)
+       N = ccl_fetch(sd, N);
+       if(ccl_fetch(sd, object) != OBJECT_NONE)
                object_inverse_normal_transform(kg, sd, &N);
 
        /* project from direction vector to barycentric coordinates in triangles */
@@ -265,7 +411,7 @@ __device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *s
         * between three textures.
         *
         * the Nxyz values are the barycentric coordinates in an equilateral
-        * triangle, which in case of blending in the middle has a smaller
+        * triangle, which, in case of blending, in the middle has a smaller
         * equilateral triangle where 3 textures blend. this divides things into
         * 7 zones, with an if() test for each zone */
 
@@ -287,17 +433,17 @@ __device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *s
                /* in case of blending, test for mixes between two textures */
                if(N.z < (1.0f - limit)*(N.y + N.x)) {
                        weight.x = N.x/(N.x + N.y);
-                       weight.x = clamp((weight.x - 0.5f*(1.0f - blend))/blend, 0.0f, 1.0f);
+                       weight.x = saturate((weight.x - 0.5f*(1.0f - blend))/blend);
                        weight.y = 1.0f - weight.x;
                }
                else if(N.x < (1.0f - limit)*(N.y + N.z)) {
                        weight.y = N.y/(N.y + N.z);
-                       weight.y = clamp((weight.y - 0.5f*(1.0f - blend))/blend, 0.0f, 1.0f);
+                       weight.y = saturate((weight.y - 0.5f*(1.0f - blend))/blend);
                        weight.z = 1.0f - weight.y;
                }
                else if(N.y < (1.0f - limit)*(N.x + N.z)) {
                        weight.x = N.x/(N.x + N.z);
-                       weight.x = clamp((weight.x - 0.5f*(1.0f - blend))/blend, 0.0f, 1.0f);
+                       weight.x = saturate((weight.x - 0.5f*(1.0f - blend))/blend);
                        weight.z = 1.0f - weight.x;
                }
                else {
@@ -307,6 +453,10 @@ __device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *s
                        weight.z = ((2.0f - limit)*N.z + (limit - 1.0f))/(2.0f*limit - 1.0f);
                }
        }
+       else {
+               /* Desperate mode, no valid choice anyway, fall back to one side. */
+               weight.x = 1.0f;
+       }
 
        /* now fetch textures */
        uint co_offset, out_offset, alpha_offset, srgb;
@@ -316,13 +466,14 @@ __device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *s
        uint id = node.y;
 
        float4 f = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+       uint use_alpha = stack_valid(alpha_offset);
 
        if(weight.x > 0.0f)
-               f += weight.x*svm_image_texture(kg, id, co.y, co.z, srgb);
+               f += weight.x*svm_image_texture(kg, id, co.y, co.z, srgb, use_alpha);
        if(weight.y > 0.0f)
-               f += weight.y*svm_image_texture(kg, id, co.x, co.z, srgb);
+               f += weight.y*svm_image_texture(kg, id, co.x, co.z, srgb, use_alpha);
        if(weight.z > 0.0f)
-               f += weight.z*svm_image_texture(kg, id, co.y, co.x, srgb);
+               f += weight.z*svm_image_texture(kg, id, co.y, co.x, srgb, use_alpha);
 
        if(stack_valid(out_offset))
                stack_store_float3(stack, out_offset, make_float3(f.x, f.y, f.z));
@@ -330,8 +481,7 @@ __device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *s
                stack_store_float(stack, alpha_offset, f.w);
 }
 
-
-__device void svm_node_tex_environment(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device void svm_node_tex_environment(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {
        uint id = node.y;
        uint co_offset, out_offset, alpha_offset, srgb;
@@ -349,7 +499,8 @@ __device void svm_node_tex_environment(KernelGlobals *kg, ShaderData *sd, float
        else
                uv = direction_to_mirrorball(co);
 
-       float4 f = svm_image_texture(kg, id, uv.x, uv.y, srgb);
+       uint use_alpha = stack_valid(alpha_offset);
+       float4 f = svm_image_texture(kg, id, uv.x, uv.y, srgb, use_alpha);
 
        if(stack_valid(out_offset))
                stack_store_float3(stack, out_offset, make_float3(f.x, f.y, f.z));