CCL_NAMESPACE_BEGIN
-__device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y)
+#ifdef __KERNEL_OPENCL__
+
+/* For OpenCL all images are packed in a single array, and we do manual lookup
+ * and interpolation. */
+
+/* Read one texel of the packed image array at index `offset` and convert its
+ * four uchar channels to floats by scaling each of them with 1/255. */
+__device_inline float4 svm_image_texture_read(KernelGlobals *kg, int offset)
{
- float4 r;
+ const float scale = 1.0f/255.0f;
+ uchar4 texel = kernel_tex_fetch(__tex_image_packed, offset);
+
+ return make_float4(texel.x*scale, texel.y*scale, texel.z*scale, texel.w*scale);
+}
- /* not particularly proud of this massive switch, what are the
- alternatives?
- - use a single big 1D texture, and do our own lookup/filtering
- - group by size and use a 3d texture, performance impact
- - group into larger texture with some padding for correct lerp
+/* Wrap x into [0, width) for a positive width. The remainder of a negative
+ * x is negative, so in that case it is shifted up by one period. */
+__device_inline int svm_image_texture_wrap_periodic(int x, int width)
+{
+ int wrapped = x % width;
+
+ return (wrapped < 0)? wrapped + width: wrapped;
+}
- also note that cuda has 128 textures limit, we use 100 now, since
- we still need some for other storage */
+/* Clamp x to the texel index range [0, width - 1]. */
+__device_inline int svm_image_texture_wrap_clamp(int x, int width)
+{
+ const int last = width - 1;
+
+ return clamp(x, 0, last);
+}
-#ifdef __KERNEL_OPENCL__
- r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); /* todo */
+/* Split x into an integer cell index, written to *ix, and the returned
+ * remainder x - *ix. The cast truncates toward zero, so one is subtracted
+ * for negative x to step to the cell below. */
+__device_inline float svm_image_texture_frac(float x, int *ix)
+{
+ int cell = (int)x;
+
+ if(x < 0.0f)
+ cell -= 1;
+
+ *ix = cell;
+ return x - (float)cell;
+}
+
+/* Bilinear lookup of image `id` at (x, y) in the packed image array.
+ *
+ * The info entry of the image supplies its width, height, the offset of its
+ * first texel in the packed array and a flag that selects periodic wrapping
+ * over clamping at the borders. x and y are scaled by width and height to
+ * find the texel position, and the four surrounding texels are blended by
+ * the fractional part of that position. When srgb is non-zero the RGB
+ * channels of the result are converted to scene linear, alpha is returned
+ * as is. */
+__device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb)
+{
+ uint4 info = kernel_tex_fetch(__tex_image_packed_info, id);
+ uint width = info.x, height = info.y;
+ uint offset = info.z;
+ uint periodic = info.w;
+
+ /* integer texel position and fractional blend factors */
+ int ix, iy;
+ float tx = svm_image_texture_frac(x*width, &ix);
+ float ty = svm_image_texture_frac(y*height, &iy);
+
+ /* bring the texel and its +1 neighbour inside the image */
+ int nix, niy;
+
+ if(!periodic) {
+ ix = svm_image_texture_wrap_clamp(ix, width);
+ iy = svm_image_texture_wrap_clamp(iy, height);
+
+ nix = svm_image_texture_wrap_clamp(ix + 1, width);
+ niy = svm_image_texture_wrap_clamp(iy + 1, height);
+ }
+ else {
+ ix = svm_image_texture_wrap_periodic(ix, width);
+ iy = svm_image_texture_wrap_periodic(iy, height);
+
+ nix = svm_image_texture_wrap_periodic(ix + 1, width);
+ niy = svm_image_texture_wrap_periodic(iy + 1, height);
+ }
+
+ /* start of the two texel rows inside the packed array */
+ uint row = offset + iy*width;
+ uint nrow = offset + niy*width;
+
+ float4 t00 = svm_image_texture_read(kg, row + ix);
+ float4 t10 = svm_image_texture_read(kg, row + nix);
+ float4 t01 = svm_image_texture_read(kg, nrow + ix);
+ float4 t11 = svm_image_texture_read(kg, nrow + nix);
+
+ /* weighted sum of the four texels */
+ float4 r = (1.0f - ty)*(1.0f - tx)*t00;
+ r += (1.0f - ty)*tx*t10;
+ r += ty*(1.0f - tx)*t01;
+ r += ty*tx*t11;
+
+ if(srgb) {
+ r.x = color_srgb_to_scene_linear(r.x);
+ r.y = color_srgb_to_scene_linear(r.y);
+ r.z = color_srgb_to_scene_linear(r.z);
+ }
+
+ return r;
+}
+
+#else
+
+/* Look up image `id` at (x, y) for the non-OpenCL kernels.
+ *
+ * With __KERNEL_CPU__ the lookup goes through kernel_tex_image_interp() with
+ * the image id, otherwise a switch selects the texture that belongs to the
+ * id. An id without a matching case yields zeros in all four channels rather
+ * than an uninitialized value. When srgb is non-zero the RGB channels are
+ * converted to scene linear, alpha is returned as is. */
+__device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb)
+{
+ float4 r;
+
+#ifdef __KERNEL_CPU__
+ r = kernel_tex_image_interp(id, x, y);
#else
+ /* not particularly proud of this massive switch, what are the
+ * alternatives?
+ * - use a single big 1D texture, and do our own lookup/filtering
+ * - group by size and use a 3d texture, performance impact
+ * - group into larger texture with some padding for correct lerp
+ *
+ * also note that cuda has 128 textures limit, we use 100 now, since
+ * we still need some for other storage */
+
switch(id) {
- case 0: r = kernel_tex_image_interp(__tex_image_000, x, y); break;
- case 1: r = kernel_tex_image_interp(__tex_image_001, x, y); break;
- case 2: r = kernel_tex_image_interp(__tex_image_002, x, y); break;
- case 3: r = kernel_tex_image_interp(__tex_image_003, x, y); break;
- case 4: r = kernel_tex_image_interp(__tex_image_004, x, y); break;
+ case 0: r = kernel_tex_image_interp(__tex_image_float_000, x, y); break;
+ case 1: r = kernel_tex_image_interp(__tex_image_float_001, x, y); break;
+ case 2: r = kernel_tex_image_interp(__tex_image_float_002, x, y); break;
+ case 3: r = kernel_tex_image_interp(__tex_image_float_003, x, y); break;
+ case 4: r = kernel_tex_image_interp(__tex_image_float_004, x, y); break;
case 5: r = kernel_tex_image_interp(__tex_image_005, x, y); break;
case 6: r = kernel_tex_image_interp(__tex_image_006, x, y); break;
case 7: r = kernel_tex_image_interp(__tex_image_007, x, y); break;
+ /* an id without a texture must not leave r uninitialized */
+ default: r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); break;
}
#endif
+ if(srgb) {
+ r.x = color_srgb_to_scene_linear(r.x);
+ r.y = color_srgb_to_scene_linear(r.y);
+ r.z = color_srgb_to_scene_linear(r.z);
+ }
+
return r;
}
+#endif
+
+/* Image texture node: sample image node.y at the xy of the coordinate read
+ * from the stack and store the color and alpha results.
+ *
+ * node.z packs the stack offsets of the coordinate, color output and alpha
+ * output together with the srgb flag. The srgb conversion is done inside
+ * svm_image_texture(), and each output is only written when its stack
+ * offset is valid. */
__device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
{
uint id = node.y;
uint co_offset, out_offset, alpha_offset, srgb;
decode_node_uchar4(node.z, &co_offset, &out_offset, &alpha_offset, &srgb);
float3 co = stack_load_float3(stack, co_offset);
- float4 f = svm_image_texture(kg, id, co.x, co.y);
- float3 r = make_float3(f.x, f.y, f.z);
+ float4 f = svm_image_texture(kg, id, co.x, co.y, srgb);
- if(srgb) {
- r.x = color_srgb_to_scene_linear(r.x);
- r.y = color_srgb_to_scene_linear(r.y);
- r.z = color_srgb_to_scene_linear(r.z);
+ if(stack_valid(out_offset))
+ stack_store_float3(stack, out_offset, make_float3(f.x, f.y, f.z));
+ if(stack_valid(alpha_offset))
+ stack_store_float(stack, alpha_offset, f.w);
+}
+
+/* Box projected image texture node.
+ *
+ * The absolute, sum-normalized shading normal (taken to object space when
+ * the shader data has an object) decides how much each of the three axis
+ * projections of image node.y contributes. node.w holds the blend factor as
+ * float bits, node.z packs the stack offsets of the coordinate, color
+ * output and alpha output together with the srgb flag. The weighted sum of
+ * the projections is stored to the outputs whose stack offset is valid. */
+__device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+{
+ /* get object space normal */
+ float3 N = sd->N;
+
+ if(sd->object != ~0)
+ object_inverse_normal_transform(kg, sd, &N);
+
+ /* project from direction vector to barycentric coordinates in triangles */
+ N.x = fabsf(N.x);
+ N.y = fabsf(N.y);
+ N.z = fabsf(N.z);
+
+ N /= (N.x + N.y + N.z);
+
+ /* basic idea is to think of this as a triangle, each corner representing
+ * one of the 3 faces of the cube. in the corners we have single textures,
+ * in between we blend between two textures, and in the middle we have a
+ * blend between three textures.
+ *
+ * the Nxyz values are the barycentric coordinates in an equilateral
+ * triangle, which in case of blending in the middle has a smaller
+ * equilateral triangle where 3 textures blend. this divides things into
+ * 7 zones, with an if() test for each zone */
+
+ float3 weight = make_float3(0.0f, 0.0f, 0.0f);
+ float blend = __int_as_float(node.w);
+ float limit = 0.5f*(1.0f + blend);
+
+ /* first test for corners with single texture */
+ if(N.x > limit*(N.x + N.y) && N.x > limit*(N.x + N.z)) {
+ weight.x = 1.0f;
+ }
+ else if(N.y > limit*(N.x + N.y) && N.y > limit*(N.y + N.z)) {
+ weight.y = 1.0f;
+ }
+ else if(N.z > limit*(N.x + N.z) && N.z > limit*(N.y + N.z)) {
+ weight.z = 1.0f;
+ }
+ else if(blend > 0.0f) {
+ /* in case of blending, test for mixes between two textures */
+ if(N.z < (1.0f - limit)*(N.y + N.x)) {
+ weight.x = N.x/(N.x + N.y);
+ weight.x = clamp((weight.x - 0.5f*(1.0f - blend))/blend, 0.0f, 1.0f);
+ weight.y = 1.0f - weight.x;
+ }
+ else if(N.x < (1.0f - limit)*(N.y + N.z)) {
+ weight.y = N.y/(N.y + N.z);
+ weight.y = clamp((weight.y - 0.5f*(1.0f - blend))/blend, 0.0f, 1.0f);
+ weight.z = 1.0f - weight.y;
+ }
+ else if(N.y < (1.0f - limit)*(N.x + N.z)) {
+ weight.x = N.x/(N.x + N.z);
+ weight.x = clamp((weight.x - 0.5f*(1.0f - blend))/blend, 0.0f, 1.0f);
+ weight.z = 1.0f - weight.x;
+ }
+ else {
+ /* last case, we have a mix between three */
+ weight.x = ((2.0f - limit)*N.x + (limit - 1.0f))/(2.0f*limit - 1.0f);
+ weight.y = ((2.0f - limit)*N.y + (limit - 1.0f))/(2.0f*limit - 1.0f);
+ weight.z = ((2.0f - limit)*N.z + (limit - 1.0f))/(2.0f*limit - 1.0f);
+ }
}
+ else {
+ /* without blending the corner tests are strict comparisons, so a tie
+ * between the largest components matches none of them. fall back to
+ * one side instead of leaving all weights at zero, which would give
+ * a black result */
+ weight.x = 1.0f;
+ }
+
+ /* now fetch textures */
+ uint co_offset, out_offset, alpha_offset, srgb;
+ decode_node_uchar4(node.z, &co_offset, &out_offset, &alpha_offset, &srgb);
+
+ float3 co = stack_load_float3(stack, co_offset);
+ uint id = node.y;
+
+ float4 f = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+ /* only sample the projections that actually contribute */
+ if(weight.x > 0.0f)
+ f += weight.x*svm_image_texture(kg, id, co.y, co.z, srgb);
+ if(weight.y > 0.0f)
+ f += weight.y*svm_image_texture(kg, id, co.x, co.z, srgb);
+ if(weight.z > 0.0f)
+ f += weight.z*svm_image_texture(kg, id, co.y, co.x, srgb);
+
if(stack_valid(out_offset))
- stack_store_float3(stack, out_offset, r);
+ stack_store_float3(stack, out_offset, make_float3(f.x, f.y, f.z));
if(stack_valid(alpha_offset))
stack_store_float(stack, alpha_offset, f.w);
}
+
+/* Environment texture node: map the normalized direction read from the stack
+ * to image coordinates, sample image node.y there and store the color and
+ * alpha results.
+ *
+ * node.w selects the mapping, 0 uses direction_to_equirectangular() and any
+ * other value direction_to_mirrorball(). node.z packs the stack offsets of
+ * the direction, color output and alpha output together with the srgb flag.
+ * Each output is only written when its stack offset is valid. */
__device void svm_node_tex_environment(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
{
uint id = node.y;
uint co_offset, out_offset, alpha_offset, srgb;
decode_node_uchar4(node.z, &co_offset, &out_offset, &alpha_offset, &srgb);
float3 co = stack_load_float3(stack, co_offset);
- float u = (atan2f(co.y, co.x) + M_PI_F)/(2*M_PI_F);
- float v = atan2f(co.z, hypotf(co.x, co.y))/M_PI_F + 0.5f;
- float4 f = svm_image_texture(kg, id, u, v);
- float3 r = make_float3(f.x, f.y, f.z);
- if(srgb) {
- r.x = color_srgb_to_scene_linear(r.x);
- r.y = color_srgb_to_scene_linear(r.y);
- r.z = color_srgb_to_scene_linear(r.z);
- }
+ /* direction handed to the projection functions */
+ float3 dir = normalize(co);
+
+ /* projection type stored in the node */
+ uint projection = node.w;
+ float2 uv = (projection == 0)? direction_to_equirectangular(dir): direction_to_mirrorball(dir);
+
+ float4 f = svm_image_texture(kg, id, uv.x, uv.y, srgb);
+
if(stack_valid(out_offset))
- stack_store_float3(stack, out_offset, r);
+ stack_store_float3(stack, out_offset, make_float3(f.x, f.y, f.z));
if(stack_valid(alpha_offset))
stack_store_float(stack, alpha_offset, f.w);
}