Cycles: Initial support of 3D textures for CUDA rendering
author Sergey Sharybin <sergey.vfx@gmail.com>
Mon, 15 Feb 2016 14:40:39 +0000 (15:40 +0100)
committer Sergey Sharybin <sergey.vfx@gmail.com>
Mon, 15 Feb 2016 20:26:29 +0000 (21:26 +0100)
Supports both smoke/fire and point density textures now.

Reduces the number of textures available for sm_20 and sm_21, but you
have to compromise somewhere on such limited hardware.

Currently limited to linear interpolation only, and decoupled ray
marching is not supported yet. I think those can be considered
further improvements.

A quick example:

  https://developer.blender.org/F282934

The code is minimal, and we can fully consider it a fix for the missing
support of 3D textures with CUDA.

Reviewers: lukasstockner97, brecht, juicyfruit, dingto

Reviewed By: brecht, juicyfruit, dingto

Subscribers: mib2berlin

Differential Revision: https://developer.blender.org/D1806

intern/cycles/device/device_cuda.cpp
intern/cycles/kernel/geom/geom_volume.h
intern/cycles/kernel/kernel_compat_cuda.h
intern/cycles/kernel/kernel_textures.h
intern/cycles/kernel/svm/svm.h
intern/cycles/kernel/svm/svm_image.h
intern/cycles/kernel/svm/svm_voxel.h
intern/cycles/render/image.h

index 3e3cd7515c70d9d16d129b06320c9b844d4095e4..98997ae0968710d17ac0980e955a1ffa629e36f8 100644 (file)
@@ -474,9 +474,20 @@ public:
                       InterpolationType interpolation,
                       ExtensionType extension)
        {
-               /* todo: support 3D textures, only CPU for now */
                VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
 
+               string bind_name = name;
+               if(mem.data_depth > 1) {
+                       /* Kernel uses different bind names for 2d and 3d float textures,
+                        * so we have to adjust couple of things here.
+                        */
+                       vector<string> tokens;
+                       string_split(tokens, name, "_");
+                       bind_name = string_printf("__tex_image_%s3d_%s",
+                                                 tokens[2].c_str(),
+                                                 tokens[3].c_str());
+               }
+
                /* determine format */
                CUarray_format_enum format;
                size_t dsize = datatype_size(mem.data_type);
@@ -496,7 +507,7 @@ public:
                        CUtexref texref = NULL;
 
                        cuda_push_context();
-                       cuda_assert(cuModuleGetTexRef(&texref, cuModule, name));
+                       cuda_assert(cuModuleGetTexRef(&texref, cuModule, bind_name.c_str()));
 
                        if(!texref) {
                                cuda_pop_context();
@@ -505,20 +516,49 @@ public:
 
                        if(interpolation != INTERPOLATION_NONE) {
                                CUarray handle = NULL;
-                               CUDA_ARRAY_DESCRIPTOR desc;
 
-                               desc.Width = mem.data_width;
-                               desc.Height = mem.data_height;
-                               desc.Format = format;
-                               desc.NumChannels = mem.data_elements;
+                               if(mem.data_depth > 1) {
+                                       CUDA_ARRAY3D_DESCRIPTOR desc;
+
+                                       desc.Width = mem.data_width;
+                                       desc.Height = mem.data_height;
+                                       desc.Depth = mem.data_depth;
+                                       desc.Format = format;
+                                       desc.NumChannels = mem.data_elements;
+                                       desc.Flags = 0;
+
+                                       cuda_assert(cuArray3DCreate(&handle, &desc));
+                               }
+                               else {
+                                       CUDA_ARRAY_DESCRIPTOR desc;
 
-                               cuda_assert(cuArrayCreate(&handle, &desc));
+                                       desc.Width = mem.data_width;
+                                       desc.Height = mem.data_height;
+                                       desc.Format = format;
+                                       desc.NumChannels = mem.data_elements;
+
+                                       cuda_assert(cuArrayCreate(&handle, &desc));
+                               }
 
                                if(!handle) {
                                        cuda_pop_context();
                                        return;
                                }
 
+                               if(mem.data_depth > 1) {
+                                       CUDA_MEMCPY3D param;
+                                       memset(&param, 0, sizeof(param));
+                                       param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+                                       param.dstArray = handle;
+                                       param.srcMemoryType = CU_MEMORYTYPE_HOST;
+                                       param.srcHost = (void*)mem.data_pointer;
+                                       param.srcPitch = mem.data_width*dsize*mem.data_elements;
+                                       param.WidthInBytes = param.srcPitch;
+                                       param.Height = mem.data_height;
+                                       param.Depth = mem.data_depth;
+
+                                       cuda_assert(cuMemcpy3D(&param));
+                               }
                                if(mem.data_height > 1) {
                                        CUDA_MEMCPY2D param;
                                        memset(&param, 0, sizeof(param));
@@ -595,7 +635,7 @@ public:
                        CUdeviceptr cumem;
                        size_t cubytes;
 
-                       cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, name));
+                       cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str()));
 
                        if(cubytes == 8) {
                                /* 64 bit device pointer */
index c72afa2a3a4088921c382cb8ca2ff8afe55b29ee..14b6738b23e970400e120fabfb02b0c57410a448 100644 (file)
@@ -29,6 +29,21 @@ CCL_NAMESPACE_BEGIN
 
 /* Return position normalized to 0..1 in mesh bounds */
 
+#ifdef __KERNEL_GPU__
+ccl_device float4 volume_image_texture_3d(int id, float x, float y, float z)
+{
+       float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);  /* zero fallback: ids outside 0..4 match no case below */
+       switch(id) {  /* fetch from __tex_image_float3d_<id> at (x, y, z) */
+               case 0: r = kernel_tex_image_interp_3d(__tex_image_float3d_000, x, y, z); break;
+               case 1: r = kernel_tex_image_interp_3d(__tex_image_float3d_001, x, y, z); break;
+               case 2: r = kernel_tex_image_interp_3d(__tex_image_float3d_002, x, y, z); break;
+               case 3: r = kernel_tex_image_interp_3d(__tex_image_float3d_003, x, y, z); break;
+               case 4: r = kernel_tex_image_interp_3d(__tex_image_float3d_004, x, y, z); break;
+       }
+       return r;
+}
+#endif  /* __KERNEL_GPU__ */
+
 ccl_device float3 volume_normalized_position(KernelGlobals *kg, const ShaderData *sd, float3 P)
 {
        /* todo: optimize this so it's just a single matrix multiplication when
@@ -50,7 +65,7 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd,
 {
        float3 P = volume_normalized_position(kg, sd, sd->P);
 #ifdef __KERNEL_GPU__
-       float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+       float4 r = volume_image_texture_3d(id, P.x, P.y, P.z);
 #else
        float4 r;
        if(sd->flag & SD_VOLUME_CUBIC)
@@ -70,7 +85,7 @@ ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *s
 {
        float3 P = volume_normalized_position(kg, sd, sd->P);
 #ifdef __KERNEL_GPU__
-       float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+       float4 r = volume_image_texture_3d(id, P.x, P.y, P.z);
 #else
        float4 r;
        if(sd->flag & SD_VOLUME_CUBIC)
index 9fdd3abfec35bd10ae08c4cf4e10879777ab10ed..193c255610cbbfd6d14c3f97897b9a40866b920b 100644 (file)
@@ -62,6 +62,7 @@ typedef texture<int, 1> texture_int;
 typedef texture<uint4, 1> texture_uint4;
 typedef texture<uchar4, 1> texture_uchar4;
 typedef texture<float4, 2> texture_image_float4;
+typedef texture<float4, 3> texture_image3d_float4;
 typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4;
 
 /* Macros to handle different memory storage on different devices */
@@ -79,6 +80,7 @@ typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4;
 #define kernel_tex_fetch(t, index) t[(index)]
 #endif
 #define kernel_tex_image_interp(t, x, y) tex2D(t, x, y)
+#define kernel_tex_image_interp_3d(t, x, y, z) tex3D(t, x, y, z)
 
 #define kernel_data __data
 
index f545a056cc8ef7b8d87400010e2dc7fc335cd6d3..24cb1c348178372c8f3b4b3e742c9d1afd6a4232 100644 (file)
@@ -79,6 +79,12 @@ KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float_002)
 KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float_003)
 KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float_004)
 
+KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float3d_000)
+KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float3d_001)
+KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float3d_002)
+KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float3d_003)
+KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float3d_004)
+
 /* image */
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_005)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_006)
index 633e1edfb1987759baa3d960d5c44e19889dfb75..9865da2e8cd796d9904c21d0743e8ae49788fbbd 100644 (file)
@@ -447,11 +447,11 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ccl_a
                                svm_node_blackbody(kg, sd, stack, node.y, node.z);
                                break;
 #  endif  /* __EXTRA_NODES__ */
-#  if NODES_FEATURE(NODE_FEATURE_VOLUME) && !defined(__KERNEL_GPU__)
+#  if NODES_FEATURE(NODE_FEATURE_VOLUME)
                        case NODE_TEX_VOXEL:
                                svm_node_tex_voxel(kg, sd, stack, node, &offset);
                                break;
-#  endif  /* NODES_FEATURE(NODE_FEATURE_VOLUME) && !defined(__KERNEL_GPU__) */
+#  endif  /* NODES_FEATURE(NODE_FEATURE_VOLUME) */
 #endif  /* NODES_GROUP(NODE_GROUP_LEVEL_3) */
                        case NODE_END:
                                return;
index caf0b37ba352193c2f2b9c4ca7f6d5fbe276e334..86d3262795fd8199802adc88fa4db201262a3105 100644 (file)
@@ -246,13 +246,13 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
                case 90: r = kernel_tex_image_interp(__tex_image_090, x, y); break;
                case 91: r = kernel_tex_image_interp(__tex_image_091, x, y); break;
                case 92: r = kernel_tex_image_interp(__tex_image_092, x, y); break;
+
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
                case 93: r = kernel_tex_image_interp(__tex_image_093, x, y); break;
                case 94: r = kernel_tex_image_interp(__tex_image_094, x, y); break;
                case 95: r = kernel_tex_image_interp(__tex_image_095, x, y); break;
                case 96: r = kernel_tex_image_interp(__tex_image_096, x, y); break;
                case 97: r = kernel_tex_image_interp(__tex_image_097, x, y); break;
-
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
                case 98: r = kernel_tex_image_interp(__tex_image_098, x, y); break;
                case 99: r = kernel_tex_image_interp(__tex_image_099, x, y); break;
                case 100: r = kernel_tex_image_interp(__tex_image_100, x, y); break;
index 31cad5ec88794d261953171d4b2e7ac5b127ce05..af03ce3fe12b4ae1c7663327d145c21770fb0c4d 100644 (file)
@@ -16,8 +16,6 @@
 
 CCL_NAMESPACE_BEGIN
 
-#if !defined(__KERNEL_GPU__)
-
 /* TODO(sergey): Think of making it more generic volume-type attribute
  * sampler.
  */
@@ -43,13 +41,15 @@ ccl_device void svm_node_tex_voxel(KernelGlobals *kg,
                tfm.w = read_node_float(kg, offset);
                co = transform_point(&tfm, co);
        }
+#if defined(__KERNEL_GPU__)
+       float4 r = volume_image_texture_3d(id, co.x, co.y, co.z);
+#else
        float4 r = kernel_tex_image_interp_3d(id, co.x, co.y, co.z);
+#endif
        if (stack_valid(density_out_offset))
                stack_store_float(stack, density_out_offset, r.w);
        if (stack_valid(color_out_offset))
                stack_store_float3(stack, color_out_offset, make_float3(r.x, r.y, r.z));
 }
 
-#endif  /* !defined(__KERNEL_GPU__) */
-
 CCL_NAMESPACE_END
index c79c152afdee9e5a6841555c3989584932347bde..c5561e16cb35e3ad25f3bf478c63e8ab7ff211e2 100644 (file)
@@ -29,7 +29,7 @@
 CCL_NAMESPACE_BEGIN
 
 /* generic */
-#define TEX_NUM_IMAGES                 94
+#define TEX_NUM_IMAGES                 88
 #define TEX_IMAGE_BYTE_START   TEX_NUM_FLOAT_IMAGES
 
 /* extended gpu */