Cycles: Add support for bindless textures.
author Thomas Dinges <blender@dingto.org>
Thu, 19 May 2016 10:47:41 +0000 (12:47 +0200)
committer Thomas Dinges <blender@dingto.org>
Thu, 19 May 2016 11:14:37 +0000 (13:14 +0200)
This adds support for CUDA texture objects (also known as bindless textures) for Kepler GPUs (GeForce 6xx and above).
This is used for all 2D/3D textures, data still uses arrays as before.

User benefits:
* No more limits of image textures on Kepler.
 We had 5 float4 and 145 byte4 slots there before, now we have 1024 float4 and 1024 byte4.
 This can be extended further if we need to (just change the define).

* Single-channel texture slots (byte and float) are now supported on Kepler as well (1024 slots for each type).

ToDo / Issues:
* 3D textures don't work yet, or at least don't show up during render. I have no idea what's wrong yet.
* Dynamically allocate bindless_mapping array?

I hope Fermi still works fine, but that should be tested on a Fermi card before pushing to master.

Part of my GSoC 2016.

Reviewers: sergey, #cycles, brecht

Subscribers: swerner, jtheninja, brecht, sergey

Differential Revision: https://developer.blender.org/D1999

intern/cycles/device/device.h
intern/cycles/device/device_cuda.cpp
intern/cycles/device/device_multi.cpp
intern/cycles/kernel/geom/geom_volume.h
intern/cycles/kernel/kernel_compat_cuda.h
intern/cycles/kernel/kernel_textures.h
intern/cycles/kernel/svm/svm_image.h
intern/cycles/kernel/svm/svm_voxel.h
intern/cycles/render/image.cpp
intern/cycles/util/util_texture.h

index 4c1b72248379d627541fa473b8ecf5557aa6a920..e11bb7f76af7c64a58fa17405f8dac199d615460 100644 (file)
@@ -54,7 +54,7 @@ public:
        bool display_device;
        bool advanced_shading;
        bool pack_images;
-       bool extended_images; /* flag for GPU and Multi device */
+       bool has_bindless_textures; /* flag for GPU and Multi device */
        bool use_split_kernel; /* Denotes if the device is going to run cycles using split-kernel */
        vector<DeviceInfo> multi_devices;
 
@@ -66,7 +66,7 @@ public:
                display_device = false;
                advanced_shading = true;
                pack_images = false;
-               extended_images = false;
+               has_bindless_textures = false;
                use_split_kernel = false;
        }
 };
@@ -230,6 +230,7 @@ public:
                (void)interpolation;  /* Ignored. */
                (void)extension;  /* Ignored. */
        };
+
        virtual void tex_free(device_memory& /*mem*/) {};
 
        /* pixel memory */
index 12c62c0702c61075c8c41acb417fb4019b4ef184..39bb44268263b681bc2d1c3abf7c01e33d024681 100644 (file)
@@ -85,10 +85,10 @@ public:
        CUcontext cuContext;
        CUmodule cuModule;
        map<device_ptr, bool> tex_interp_map;
+       map<device_ptr, uint> tex_bindless_map;
        int cuDevId;
        int cuDevArchitecture;
        bool first_error;
-       bool use_texture_storage;
 
        struct PixelMem {
                GLuint cuPBO;
@@ -99,6 +99,10 @@ public:
 
        map<device_ptr, PixelMem> pixel_mem_map;
 
+       /* Bindless Textures */
+       device_vector<uint> bindless_mapping;
+       bool need_bindless_mapping;
+
        CUdeviceptr cuda_device_ptr(device_ptr mem)
        {
                return (CUdeviceptr)mem;
@@ -176,12 +180,13 @@ public:
        {
                first_error = true;
                background = background_;
-               use_texture_storage = true;
 
                cuDevId = info.num;
                cuDevice = 0;
                cuContext = 0;
 
+               need_bindless_mapping = false;
+
                /* intialize */
                if(cuda_error(cuInit(0)))
                        return;
@@ -211,11 +216,6 @@ public:
                cuDeviceComputeCapability(&major, &minor, cuDevId);
                cuDevArchitecture = major*100 + minor*10;
 
-               /* In order to use full 6GB of memory on Titan cards, use arrays instead
-                * of textures. On earlier cards this seems slower, but on Titan it is
-                * actually slightly faster in tests. */
-               use_texture_storage = (cuDevArchitecture < 300);
-
                cuda_pop_context();
        }
 
@@ -223,6 +223,10 @@ public:
        {
                task_pool.stop();
 
+               if(info.has_bindless_textures) {
+                       tex_free(bindless_mapping);
+               }
+
                cuda_assert(cuCtxDestroy(cuContext));
        }
 
@@ -400,6 +404,15 @@ public:
                return (result == CUDA_SUCCESS);
        }
 
+       void load_bindless_mapping()
+       {
+               if(info.has_bindless_textures && need_bindless_mapping) {
+                       tex_free(bindless_mapping);
+                       tex_alloc("__bindless_mapping", bindless_mapping, INTERPOLATION_NONE, EXTENSION_REPEAT);
+                       need_bindless_mapping = false;
+               }
+       }
+
        void mem_alloc(device_memory& mem, MemoryType /*type*/)
        {
                cuda_push_context();
@@ -479,126 +492,99 @@ public:
        {
                VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
 
+               /* Check if we are on sm_30 or above.
+                * We use arrays and bindless textures for storage there */
+               bool has_bindless_textures = info.has_bindless_textures;
+
+               /* General variables for both architectures */
                string bind_name = name;
-               if(mem.data_depth > 1) {
-                       /* Kernel uses different bind names for 2d and 3d float textures,
-                        * so we have to adjust couple of things here.
-                        */
-                       vector<string> tokens;
-                       string_split(tokens, name, "_");
-                       bind_name = string_printf("__tex_image_%s_3d_%s",
-                                                 tokens[2].c_str(),
-                                                 tokens[3].c_str());
+               size_t dsize = datatype_size(mem.data_type);
+               size_t size = mem.memory_size();
+
+               CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
+               switch(extension) {
+                       case EXTENSION_REPEAT:
+                               address_mode = CU_TR_ADDRESS_MODE_WRAP;
+                               break;
+                       case EXTENSION_EXTEND:
+                               address_mode = CU_TR_ADDRESS_MODE_CLAMP;
+                               break;
+                       case EXTENSION_CLIP:
+                               address_mode = CU_TR_ADDRESS_MODE_BORDER;
+                               break;
+                       default:
+                               assert(0);
+                               break;
+               }
+
+               CUfilter_mode filter_mode;
+               if(interpolation == INTERPOLATION_CLOSEST) {
+                       filter_mode = CU_TR_FILTER_MODE_POINT;
+               }
+               else {
+                       filter_mode = CU_TR_FILTER_MODE_LINEAR;
                }
 
-               /* determine format */
                CUarray_format_enum format;
-               size_t dsize = datatype_size(mem.data_type);
-               size_t size = mem.memory_size();
-               bool use_texture = (interpolation != INTERPOLATION_NONE) || use_texture_storage;
+               switch(mem.data_type) {
+                       case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
+                       case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
+                       case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
+                       case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
+                       default: assert(0); return;
+               }
 
-               if(use_texture) {
+               /* General variables for Fermi */
+               CUtexref texref = NULL;
 
-                       switch(mem.data_type) {
-                               case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
-                               case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
-                               case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
-                               case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
-                               default: assert(0); return;
+               if(!has_bindless_textures) {
+                       if(mem.data_depth > 1) {
+                               /* Kernel uses different bind names for 2d and 3d float textures,
+                                * so we have to adjust couple of things here.
+                                */
+                               vector<string> tokens;
+                               string_split(tokens, name, "_");
+                               bind_name = string_printf("__tex_image_%s_3d_%s",
+                                                         tokens[2].c_str(),
+                                                         tokens[3].c_str());
                        }
 
-                       CUtexref texref = NULL;
-
                        cuda_push_context();
                        cuda_assert(cuModuleGetTexRef(&texref, cuModule, bind_name.c_str()));
+                       cuda_pop_context();
 
                        if(!texref) {
-                               cuda_pop_context();
                                return;
                        }
+               }
 
-                       if(interpolation != INTERPOLATION_NONE) {
-                               CUarray handle = NULL;
-
-                               if(mem.data_depth > 1) {
-                                       CUDA_ARRAY3D_DESCRIPTOR desc;
-
-                                       desc.Width = mem.data_width;
-                                       desc.Height = mem.data_height;
-                                       desc.Depth = mem.data_depth;
-                                       desc.Format = format;
-                                       desc.NumChannels = mem.data_elements;
-                                       desc.Flags = 0;
-
-                                       cuda_assert(cuArray3DCreate(&handle, &desc));
-                               }
-                               else {
-                                       CUDA_ARRAY_DESCRIPTOR desc;
-
-                                       desc.Width = mem.data_width;
-                                       desc.Height = mem.data_height;
-                                       desc.Format = format;
-                                       desc.NumChannels = mem.data_elements;
-
-                                       cuda_assert(cuArrayCreate(&handle, &desc));
-                               }
+               /* Data Storage */
+               if(interpolation == INTERPOLATION_NONE) {
+                       if(has_bindless_textures) {
+                               mem_alloc(mem, MEM_READ_ONLY);
+                               mem_copy_to(mem);
 
-                               if(!handle) {
-                                       cuda_pop_context();
-                                       return;
-                               }
+                               cuda_push_context();
 
-                               if(mem.data_depth > 1) {
-                                       CUDA_MEMCPY3D param;
-                                       memset(&param, 0, sizeof(param));
-                                       param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
-                                       param.dstArray = handle;
-                                       param.srcMemoryType = CU_MEMORYTYPE_HOST;
-                                       param.srcHost = (void*)mem.data_pointer;
-                                       param.srcPitch = mem.data_width*dsize*mem.data_elements;
-                                       param.WidthInBytes = param.srcPitch;
-                                       param.Height = mem.data_height;
-                                       param.Depth = mem.data_depth;
-
-                                       cuda_assert(cuMemcpy3D(&param));
-                               }
-                               else if(mem.data_height > 1) {
-                                       CUDA_MEMCPY2D param;
-                                       memset(&param, 0, sizeof(param));
-                                       param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
-                                       param.dstArray = handle;
-                                       param.srcMemoryType = CU_MEMORYTYPE_HOST;
-                                       param.srcHost = (void*)mem.data_pointer;
-                                       param.srcPitch = mem.data_width*dsize*mem.data_elements;
-                                       param.WidthInBytes = param.srcPitch;
-                                       param.Height = mem.data_height;
-
-                                       cuda_assert(cuMemcpy2D(&param));
-                               }
-                               else
-                                       cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size));
+                               CUdeviceptr cumem;
+                               size_t cubytes;
 
-                               cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT));
+                               cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str()));
 
-                               if(interpolation == INTERPOLATION_CLOSEST) {
-                                       cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
+                               if(cubytes == 8) {
+                                       /* 64 bit device pointer */
+                                       uint64_t ptr = mem.device_pointer;
+                                       cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
                                }
-                               else if(interpolation == INTERPOLATION_LINEAR) {
-                                       cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR));
-                               }
-                               else {/* CUBIC and SMART are unsupported for CUDA */
-                                       cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR));
+                               else {
+                                       /* 32 bit device pointer */
+                                       uint32_t ptr = (uint32_t)mem.device_pointer;
+                                       cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
                                }
-                               cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES));
-
-                               mem.device_pointer = (device_ptr)handle;
-                               mem.device_size = size;
 
-                               stats.mem_alloc(size);
+                               cuda_pop_context();
                        }
                        else {
-                               cuda_pop_context();
-
                                mem_alloc(mem, MEM_READ_ONLY);
                                mem_copy_to(mem);
 
@@ -607,58 +593,149 @@ public:
                                cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size));
                                cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
                                cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER));
+
+                               cuda_pop_context();
                        }
+               }
+               /* Texture Storage */
+               else {
+                       CUarray handle = NULL;
 
-                       CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
-                       switch(extension) {
-                               case EXTENSION_REPEAT:
-                                       address_mode = CU_TR_ADDRESS_MODE_WRAP;
-                                       break;
-                               case EXTENSION_EXTEND:
-                                       address_mode = CU_TR_ADDRESS_MODE_CLAMP;
-                                       break;
-                               case EXTENSION_CLIP:
-                                       address_mode = CU_TR_ADDRESS_MODE_BORDER;
-                                       break;
-                               default:
-                                       assert(0);
-                                       break;
+                       cuda_push_context();
+
+                       if(mem.data_depth > 1) {
+                               CUDA_ARRAY3D_DESCRIPTOR desc;
+
+                               desc.Width = mem.data_width;
+                               desc.Height = mem.data_height;
+                               desc.Depth = mem.data_depth;
+                               desc.Format = format;
+                               desc.NumChannels = mem.data_elements;
+                               desc.Flags = 0;
+
+                               cuda_assert(cuArray3DCreate(&handle, &desc));
                        }
-                       cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode));
-                       cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode));
+                       else {
+                               CUDA_ARRAY_DESCRIPTOR desc;
+
+                               desc.Width = mem.data_width;
+                               desc.Height = mem.data_height;
+                               desc.Format = format;
+                               desc.NumChannels = mem.data_elements;
+
+                               cuda_assert(cuArrayCreate(&handle, &desc));
+                       }
+
+                       if(!handle) {
+                               cuda_pop_context();
+                               return;
+                       }
+
+                       /* Allocate 3D, 2D or 1D memory */
                        if(mem.data_depth > 1) {
-                               cuda_assert(cuTexRefSetAddressMode(texref, 2, address_mode));
+                               CUDA_MEMCPY3D param;
+                               memset(&param, 0, sizeof(param));
+                               param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+                               param.dstArray = handle;
+                               param.srcMemoryType = CU_MEMORYTYPE_HOST;
+                               param.srcHost = (void*)mem.data_pointer;
+                               param.srcPitch = mem.data_width*dsize*mem.data_elements;
+                               param.WidthInBytes = param.srcPitch;
+                               param.Height = mem.data_height;
+                               param.Depth = mem.data_depth;
+
+                               cuda_assert(cuMemcpy3D(&param));
                        }
+                       else if(mem.data_height > 1) {
+                               CUDA_MEMCPY2D param;
+                               memset(&param, 0, sizeof(param));
+                               param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+                               param.dstArray = handle;
+                               param.srcMemoryType = CU_MEMORYTYPE_HOST;
+                               param.srcHost = (void*)mem.data_pointer;
+                               param.srcPitch = mem.data_width*dsize*mem.data_elements;
+                               param.WidthInBytes = param.srcPitch;
+                               param.Height = mem.data_height;
+
+                               cuda_assert(cuMemcpy2D(&param));
+                       }
+                       else
+                               cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size));
 
-                       cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements));
+                       /* Fermi and Kepler */
+                       mem.device_pointer = (device_ptr)handle;
+                       mem.device_size = size;
 
-                       cuda_pop_context();
-               }
-               else {
-                       mem_alloc(mem, MEM_READ_ONLY);
-                       mem_copy_to(mem);
+                       stats.mem_alloc(size);
 
-                       cuda_push_context();
+                       /* Bindless Textures - Kepler */
+                       if(has_bindless_textures) {
+                               int flat_slot = 0;
+                               if(string_startswith(name, "__tex_image")) {
+                                       int pos =  string(name).rfind("_");
+                                       flat_slot = atoi(name + pos + 1);
+                               }
+                               else {
+                                       assert(0);
+                               }
 
-                       CUdeviceptr cumem;
-                       size_t cubytes;
+                               CUDA_RESOURCE_DESC resDesc;
+                               memset(&resDesc, 0, sizeof(resDesc));
+                               resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
+                               resDesc.res.array.hArray = handle;
+                               resDesc.flags = 0;
+
+                               CUDA_TEXTURE_DESC texDesc;
+                               memset(&texDesc, 0, sizeof(texDesc));
+                               texDesc.addressMode[0] = address_mode;
+                               texDesc.addressMode[1] = address_mode;
+                               texDesc.addressMode[2] = address_mode;
+                               texDesc.filterMode = filter_mode;
+                               texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
+
+                               CUtexObject tex = 0;
+                               cuda_assert(cuTexObjectCreate(&tex, &resDesc, &texDesc, NULL));
+
+                               /* Safety check */
+                               if((uint)tex > UINT_MAX) {
+                                       assert(0);
+                               }
 
-                       cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str()));
+                               /* Resize once */
+                               if(flat_slot >= bindless_mapping.size())
+                                       bindless_mapping.resize(4096); /*TODO(dingto): Make this a variable */
 
-                       if(cubytes == 8) {
-                               /* 64 bit device pointer */
-                               uint64_t ptr = mem.device_pointer;
-                               cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
+                               /* Set Mapping and tag that we need to (re-)upload to device */
+                               bindless_mapping.get_data()[flat_slot] = (uint)tex;
+                               tex_bindless_map[mem.device_pointer] = (uint)tex;
+                               need_bindless_mapping = true;
                        }
+                       /* Regular Textures - Fermi */
                        else {
-                               /* 32 bit device pointer */
-                               uint32_t ptr = (uint32_t)mem.device_pointer;
-                               cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
+                               cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT));
+                               cuda_assert(cuTexRefSetFilterMode(texref, filter_mode));
+                               cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES));
+                       }
+
+                       cuda_pop_context();
+               }
+
+               /* Fermi, Data and Image Textures */
+               if(!has_bindless_textures) {
+                       cuda_push_context();
+
+                       cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode));
+                       cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode));
+                       if(mem.data_depth > 1) {
+                               cuda_assert(cuTexRefSetAddressMode(texref, 2, address_mode));
                        }
 
+                       cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements));
+
                        cuda_pop_context();
                }
 
+               /* Fermi and Kepler */
                tex_interp_map[mem.device_pointer] = (interpolation != INTERPOLATION_NONE);
        }
 
@@ -670,6 +747,12 @@ public:
                                cuArrayDestroy((CUarray)mem.device_pointer);
                                cuda_pop_context();
 
+                               /* Free CUtexObject (Bindless Textures) */
+                               if(info.has_bindless_textures && tex_bindless_map[mem.device_pointer]) {
+                                       uint flat_slot = tex_bindless_map[mem.device_pointer];
+                                       cuTexObjectDestroy(flat_slot);
+                               }
+
                                tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
                                mem.device_pointer = 0;
 
@@ -1111,6 +1194,9 @@ public:
                        RenderTile tile;
                        
                        bool branched = task->integrator_branched;
+
+                       /* Upload Bindless Mapping */
+                       load_bindless_mapping();
                        
                        /* keep rendering tiles until done */
                        while(task->acquire_tile(this, tile)) {
@@ -1134,6 +1220,9 @@ public:
                        }
                }
                else if(task->type == DeviceTask::SHADER) {
+                       /* Upload Bindless Mapping */
+                       load_bindless_mapping();
+
                        shader(*task);
 
                        cuda_push_context();
@@ -1269,7 +1358,7 @@ void device_cuda_info(vector<DeviceInfo>& devices)
                info.num = num;
 
                info.advanced_shading = (major >= 2);
-               info.extended_images = (major >= 3);
+               info.has_bindless_textures = (major >= 3);
                info.pack_images = false;
 
                /* if device has a kernel timeout, assume it is used for display */
index 6141f9af50f80a53a694b5d1af1e1f0b24e91937..434d0085d39333377d0c1ec94769794c783a1e90 100644 (file)
@@ -352,7 +352,7 @@ static bool device_multi_add(vector<DeviceInfo>& devices, DeviceType type, bool
 
        info.advanced_shading = with_advanced_shading;
        info.pack_images = false;
-       info.extended_images = true;
+       info.has_bindless_textures = true;
 
        foreach(DeviceInfo& subinfo, devices) {
                if(subinfo.type == type) {
@@ -376,7 +376,7 @@ static bool device_multi_add(vector<DeviceInfo>& devices, DeviceType type, bool
                        if(subinfo.display_device)
                                info.display_device = true;
                        info.pack_images = info.pack_images || subinfo.pack_images;
-                       info.extended_images = info.extended_images && subinfo.extended_images;
+                       info.has_bindless_textures = info.has_bindless_textures && subinfo.has_bindless_textures;
                        num_added++;
                }
        }
index ef02c01dec6a12025b82f920e9ab9440362061c0..2044aafc877ac68ab750b4dbec16d5e7631a5024 100644 (file)
@@ -29,7 +29,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Return position normalized to 0..1 in mesh bounds */
 
-#ifdef __KERNEL_GPU__
+#if defined(__KERNEL_GPU__) && __CUDA_ARCH__ < 300
 ccl_device float4 volume_image_texture_3d(int id, float x, float y, float z)
 {
        float4 r;
@@ -65,7 +65,13 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd,
 {
        float3 P = volume_normalized_position(kg, sd, sd->P);
 #ifdef __KERNEL_GPU__
+#  if __CUDA_ARCH__ >= 300
+       CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
+       float f = kernel_tex_image_interp_3d_float(tex, P.x, P.y, P.z);
+       float4 r = make_float4(f, f, f, 1.0);
+#  else
        float4 r = volume_image_texture_3d(id, P.x, P.y, P.z);
+#  endif
 #else
        float4 r;
        if(sd->flag & SD_VOLUME_CUBIC)
@@ -84,7 +90,12 @@ ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *s
 {
        float3 P = volume_normalized_position(kg, sd, sd->P);
 #ifdef __KERNEL_GPU__
+#  if __CUDA_ARCH__ >= 300
+       CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
+       float4 r = kernel_tex_image_interp_3d_float4(tex, P.x, P.y, P.z);
+#  else
        float4 r = volume_image_texture_3d(id, P.x, P.y, P.z);
+#  endif
 #else
        float4 r;
        if(sd->flag & SD_VOLUME_CUBIC)
index d10d3255e1b34f673c0ecad5595c2ab7d9aa4927..42314756f028cb44bc705ccdaa645f59646c95e2 100644 (file)
@@ -67,20 +67,29 @@ typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4;
 
 /* Macros to handle different memory storage on different devices */
 
-/* In order to use full 6GB of memory on Titan cards, use arrays instead
- * of textures. On earlier cards this seems slower, but on Titan it is
- * actually slightly faster in tests. */
+/* On Fermi cards (4xx and 5xx), we use regular textures for both data and images.
+ * On Kepler (6xx) and above, we use Bindless Textures for images and arrays for data.
+ *
+ * Arrays are necessary in order to use the full VRAM on newer cards, and it's slightly faster.
+ * Using Arrays on Fermi turned out to be slower.*/
+
+/* Fermi */
 #if __CUDA_ARCH__ < 300
 #  define __KERNEL_CUDA_TEX_STORAGE__
-#endif
-
-#ifdef __KERNEL_CUDA_TEX_STORAGE__
 #  define kernel_tex_fetch(t, index) tex1Dfetch(t, index)
+
+#  define kernel_tex_image_interp(t, x, y) tex2D(t, x, y)
+#  define kernel_tex_image_interp_3d(t, x, y, z) tex3D(t, x, y, z)
+
+/* Kepler */
 #else
 #  define kernel_tex_fetch(t, index) t[(index)]
+
+#  define kernel_tex_image_interp_float4(t, x, y) tex2D<float4>(t, x, y)
+#  define kernel_tex_image_interp_float(t, x, y) tex2D<float>(t, x, y)
+#  define kernel_tex_image_interp_3d_float4(t, x, y, z) tex3D<float4>(t, x, y, z)
+#  define kernel_tex_image_interp_3d_float(t, x, y, z) tex3D<float>(t, x, y, z)
 #endif
-#define kernel_tex_image_interp(t, x, y) tex2D(t, x, y)
-#define kernel_tex_image_interp_3d(t, x, y, z) tex3D(t, x, y, z)
 
 #define kernel_data __data
 
index 62b0a6f2923799f4724659b15f674fdbae03cd37..245d236ff973bee3c90f1e5163e7aad212c6b78c 100644 (file)
@@ -72,6 +72,8 @@ KERNEL_TEX(float, texture_float, __lookup_table)
 /* sobol */
 KERNEL_TEX(uint, texture_uint, __sobol_directions)
 
+#ifdef __KERNEL_CUDA__
+#  if __CUDA_ARCH__ < 300
 /* full-float image */
 KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_000)
 KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_001)
@@ -174,66 +176,12 @@ KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_089)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_090)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_091)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_092)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_093)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_094)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_095)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_096)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_097)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_098)
-
-/* Kepler and above */
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_099)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_100)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_101)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_102)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_103)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_104)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_105)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_106)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_107)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_108)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_109)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_110)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_111)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_112)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_113)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_114)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_115)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_116)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_117)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_118)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_119)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_120)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_121)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_122)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_123)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_124)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_125)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_126)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_127)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_128)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_129)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_130)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_131)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_132)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_133)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_134)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_135)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_136)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_137)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_138)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_139)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_140)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_141)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_142)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_143)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_144)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_145)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_146)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_147)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_148)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_149)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_150)
+
+#  else
+/* bindless textures: per image slot id, the CUDA texture object handle to sample from */
+KERNEL_TEX(uint, texture_uint, __bindless_mapping)
+#  endif
+#endif
 
 /* packed image (opencl) */
 KERNEL_TEX(uchar4, texture_uchar4, __tex_image_byte4_packed)
index faff4ce3e6d935044df4cd0181036e1051f2b248..92d2b36bbb198687f1680aa71d72c4b7ba311484 100644 (file)
@@ -18,11 +18,15 @@ CCL_NAMESPACE_BEGIN
 
 /* Float4 textures on various devices. */
 #if defined(__KERNEL_CPU__)
-  #define TEX_NUM_FLOAT4_IMAGES        TEX_NUM_FLOAT4_IMAGES_CPU
+#  define TEX_NUM_FLOAT4_IMAGES        TEX_NUM_FLOAT4_IMAGES_CPU
 #elif defined(__KERNEL_CUDA__)
-  #define TEX_NUM_FLOAT4_IMAGES        TEX_NUM_FLOAT4_IMAGES_CUDA
+#  if __CUDA_ARCH__ < 300
+#    define TEX_NUM_FLOAT4_IMAGES      TEX_NUM_FLOAT4_IMAGES_CUDA
+#  else
+#    define TEX_NUM_FLOAT4_IMAGES      TEX_NUM_FLOAT4_IMAGES_CUDA_KEPLER
+#  endif
 #else
-  #define TEX_NUM_FLOAT4_IMAGES        TEX_NUM_FLOAT4_IMAGES_OPENCL
+#  define TEX_NUM_FLOAT4_IMAGES        TEX_NUM_FLOAT4_IMAGES_OPENCL
 #endif
 
 #ifdef __KERNEL_OPENCL__
@@ -151,6 +155,7 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
 #else
        float4 r;
 
+#  if __CUDA_ARCH__ < 300
        /* not particularly proud of this massive switch, what are the
         * alternatives?
         * - use a single big 1D texture, and do our own lookup/filtering
@@ -254,72 +259,19 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
                case 90: r = kernel_tex_image_interp(__tex_image_byte4_090, x, y); break;
                case 91: r = kernel_tex_image_interp(__tex_image_byte4_091, x, y); break;
                case 92: r = kernel_tex_image_interp(__tex_image_byte4_092, x, y); break;
-
-#  if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
-               case 93: r = kernel_tex_image_interp(__tex_image_byte4_093, x, y); break;
-               case 94: r = kernel_tex_image_interp(__tex_image_byte4_094, x, y); break;
-               case 95: r = kernel_tex_image_interp(__tex_image_byte4_095, x, y); break;
-               case 96: r = kernel_tex_image_interp(__tex_image_byte4_096, x, y); break;
-               case 97: r = kernel_tex_image_interp(__tex_image_byte4_097, x, y); break;
-               case 98: r = kernel_tex_image_interp(__tex_image_byte4_098, x, y); break;
-               case 99: r = kernel_tex_image_interp(__tex_image_byte4_099, x, y); break;
-               case 100: r = kernel_tex_image_interp(__tex_image_byte4_100, x, y); break;
-               case 101: r = kernel_tex_image_interp(__tex_image_byte4_101, x, y); break;
-               case 102: r = kernel_tex_image_interp(__tex_image_byte4_102, x, y); break;
-               case 103: r = kernel_tex_image_interp(__tex_image_byte4_103, x, y); break;
-               case 104: r = kernel_tex_image_interp(__tex_image_byte4_104, x, y); break;
-               case 105: r = kernel_tex_image_interp(__tex_image_byte4_105, x, y); break;
-               case 106: r = kernel_tex_image_interp(__tex_image_byte4_106, x, y); break;
-               case 107: r = kernel_tex_image_interp(__tex_image_byte4_107, x, y); break;
-               case 108: r = kernel_tex_image_interp(__tex_image_byte4_108, x, y); break;
-               case 109: r = kernel_tex_image_interp(__tex_image_byte4_109, x, y); break;
-               case 110: r = kernel_tex_image_interp(__tex_image_byte4_110, x, y); break;
-               case 111: r = kernel_tex_image_interp(__tex_image_byte4_111, x, y); break;
-               case 112: r = kernel_tex_image_interp(__tex_image_byte4_112, x, y); break;
-               case 113: r = kernel_tex_image_interp(__tex_image_byte4_113, x, y); break;
-               case 114: r = kernel_tex_image_interp(__tex_image_byte4_114, x, y); break;
-               case 115: r = kernel_tex_image_interp(__tex_image_byte4_115, x, y); break;
-               case 116: r = kernel_tex_image_interp(__tex_image_byte4_116, x, y); break;
-               case 117: r = kernel_tex_image_interp(__tex_image_byte4_117, x, y); break;
-               case 118: r = kernel_tex_image_interp(__tex_image_byte4_118, x, y); break;
-               case 119: r = kernel_tex_image_interp(__tex_image_byte4_119, x, y); break;
-               case 120: r = kernel_tex_image_interp(__tex_image_byte4_120, x, y); break;
-               case 121: r = kernel_tex_image_interp(__tex_image_byte4_121, x, y); break;
-               case 122: r = kernel_tex_image_interp(__tex_image_byte4_122, x, y); break;
-               case 123: r = kernel_tex_image_interp(__tex_image_byte4_123, x, y); break;
-               case 124: r = kernel_tex_image_interp(__tex_image_byte4_124, x, y); break;
-               case 125: r = kernel_tex_image_interp(__tex_image_byte4_125, x, y); break;
-               case 126: r = kernel_tex_image_interp(__tex_image_byte4_126, x, y); break;
-               case 127: r = kernel_tex_image_interp(__tex_image_byte4_127, x, y); break;
-               case 128: r = kernel_tex_image_interp(__tex_image_byte4_128, x, y); break;
-               case 129: r = kernel_tex_image_interp(__tex_image_byte4_129, x, y); break;
-               case 130: r = kernel_tex_image_interp(__tex_image_byte4_130, x, y); break;
-               case 131: r = kernel_tex_image_interp(__tex_image_byte4_131, x, y); break;
-               case 132: r = kernel_tex_image_interp(__tex_image_byte4_132, x, y); break;
-               case 133: r = kernel_tex_image_interp(__tex_image_byte4_133, x, y); break;
-               case 134: r = kernel_tex_image_interp(__tex_image_byte4_134, x, y); break;
-               case 135: r = kernel_tex_image_interp(__tex_image_byte4_135, x, y); break;
-               case 136: r = kernel_tex_image_interp(__tex_image_byte4_136, x, y); break;
-               case 137: r = kernel_tex_image_interp(__tex_image_byte4_137, x, y); break;
-               case 138: r = kernel_tex_image_interp(__tex_image_byte4_138, x, y); break;
-               case 139: r = kernel_tex_image_interp(__tex_image_byte4_139, x, y); break;
-               case 140: r = kernel_tex_image_interp(__tex_image_byte4_140, x, y); break;
-               case 141: r = kernel_tex_image_interp(__tex_image_byte4_141, x, y); break;
-               case 142: r = kernel_tex_image_interp(__tex_image_byte4_142, x, y); break;
-               case 143: r = kernel_tex_image_interp(__tex_image_byte4_143, x, y); break;
-               case 144: r = kernel_tex_image_interp(__tex_image_byte4_144, x, y); break;
-               case 145: r = kernel_tex_image_interp(__tex_image_byte4_145, x, y); break;
-               case 146: r = kernel_tex_image_interp(__tex_image_byte4_146, x, y); break;
-               case 147: r = kernel_tex_image_interp(__tex_image_byte4_147, x, y); break;
-               case 148: r = kernel_tex_image_interp(__tex_image_byte4_148, x, y); break;
-               case 149: r = kernel_tex_image_interp(__tex_image_byte4_149, x, y); break;
-               case 150: r = kernel_tex_image_interp(__tex_image_byte4_150, x, y); break;
-#  endif
-
                default:
                        kernel_assert(0);
                        return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
        }
+#  else
+       CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id); /* slot id -> texture object handle */
+       if(id < TEX_IMAGE_FLOAT_START_CUDA_KEPLER) /* float4 and byte4 slots come before the single channel slots */
+               r = kernel_tex_image_interp_float4(tex, x, y);
+       else {
+               float f = kernel_tex_image_interp_float(tex, x, y); /* single channel: replicate into RGB, opaque alpha */
+               r = make_float4(f, f, f, 1.0f);
+       }
+#  endif
 #endif
 
 #ifdef __KERNEL_SSE2__
index 85ba2f906fa744bc57b55f35f4c52683845fb3d8..d2cc2c3730e1ade25d75ec665f29234bd86c606d 100644 (file)
@@ -42,10 +42,21 @@ ccl_device void svm_node_tex_voxel(KernelGlobals *kg,
                tfm.w = read_node_float(kg, offset);
                co = transform_point(&tfm, co);
        }
+       float4 r;
 #  if defined(__KERNEL_GPU__)
-       float4 r = volume_image_texture_3d(id, co.x, co.y, co.z);
-#  else
-       float4 r = kernel_tex_image_interp_3d(id, co.x, co.y, co.z);
+#    if __CUDA_ARCH__ >= 300
+       CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id); /* slot id -> texture object handle */
+       if(id < 2048) /* TODO(dingto): Make this a variable; 2048 = float4 + byte4 slot counts on Kepler */
+               r = kernel_tex_image_interp_3d_float4(tex, co.x, co.y, co.z);
+       else {
+               float f = kernel_tex_image_interp_3d_float(tex, co.x, co.y, co.z); /* single channel: replicate into RGB, opaque alpha */
+               r = make_float4(f, f, f, 1.0f);
+       }
+#    else /* __CUDA_ARCH__ >= 300 */
+       r = volume_image_texture_3d(id, co.x, co.y, co.z);
+#    endif
+#  else /* __KERNEL_GPU__ */
+       r = kernel_tex_image_interp_3d(id, co.x, co.y, co.z);
 #  endif
 #else
        float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
index 898e00fdcd94c370de8a52ec590b8e6b2c060753..9f40e561f59902c6bbe6d586e89d6bea7e671a9a 100644 (file)
@@ -49,7 +49,7 @@ ImageManager::ImageManager(const DeviceInfo& info)
                tex_image_byte_start = TEX_IMAGE_BYTE_START_CPU;
        }
        /* CUDA (Fermi) */
-       else if((info.type == DEVICE_CUDA || info.type == DEVICE_MULTI) && !info.extended_images) {
+       else if((info.type == DEVICE_CUDA || info.type == DEVICE_MULTI) && !info.has_bindless_textures) {
                tex_num_images[IMAGE_DATA_TYPE_BYTE4] = TEX_NUM_BYTE4_IMAGES_CUDA;
                tex_num_images[IMAGE_DATA_TYPE_FLOAT4] = TEX_NUM_FLOAT4_IMAGES_CUDA;
                tex_num_images[IMAGE_DATA_TYPE_FLOAT] = TEX_NUM_FLOAT_IMAGES_CUDA;
@@ -59,7 +59,7 @@ ImageManager::ImageManager(const DeviceInfo& info)
                tex_image_byte_start = TEX_IMAGE_BYTE_START_CUDA;
        }
        /* CUDA (Kepler and above) */
-       else if((info.type == DEVICE_CUDA || info.type == DEVICE_MULTI) && info.extended_images) {
+       else if((info.type == DEVICE_CUDA || info.type == DEVICE_MULTI) && info.has_bindless_textures) {
                tex_num_images[IMAGE_DATA_TYPE_BYTE4] = TEX_NUM_BYTE4_IMAGES_CUDA_KEPLER;
                tex_num_images[IMAGE_DATA_TYPE_FLOAT4] = TEX_NUM_FLOAT4_IMAGES_CUDA_KEPLER;
                tex_num_images[IMAGE_DATA_TYPE_FLOAT] = TEX_NUM_FLOAT_IMAGES_CUDA_KEPLER;
@@ -294,7 +294,7 @@ int ImageManager::add_image(const string& filename,
        if(type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4)
                is_float = true;
 
-       /* No float and byte textures on GPU yet */
+       /* No single channel textures on Fermi GPUs, use available slots */
        if(type == IMAGE_DATA_TYPE_FLOAT && tex_num_images[type] == 0)
                type = IMAGE_DATA_TYPE_FLOAT4;
        if(type == IMAGE_DATA_TYPE_BYTE && tex_num_images[type] == 0)
index 2a1cfca4fdd5063c5557acff8bb5e539106b62f1..6da478581334bf58aaf4d06862ce3d3bbfd5c8e6 100644 (file)
@@ -40,10 +40,10 @@ CCL_NAMESPACE_BEGIN
 #define TEX_IMAGE_BYTE_START_CUDA      (TEX_NUM_FLOAT4_IMAGES_CUDA + TEX_NUM_BYTE4_IMAGES_CUDA + TEX_NUM_BYTE_IMAGES_CUDA)
 
 /* CUDA (KEPLER and above) */
-#define TEX_NUM_BYTE4_IMAGES_CUDA_KEPLER       145
-#define TEX_NUM_FLOAT4_IMAGES_CUDA_KEPLER      5
-#define TEX_NUM_FLOAT_IMAGES_CUDA_KEPLER       0
-#define TEX_NUM_BYTE_IMAGES_CUDA_KEPLER                0
+#define TEX_NUM_BYTE4_IMAGES_CUDA_KEPLER       1024
+#define TEX_NUM_FLOAT4_IMAGES_CUDA_KEPLER      1024
+#define TEX_NUM_FLOAT_IMAGES_CUDA_KEPLER       1024
+#define TEX_NUM_BYTE_IMAGES_CUDA_KEPLER                1024
 #define TEX_IMAGE_BYTE4_START_CUDA_KEPLER      TEX_NUM_FLOAT4_IMAGES_CUDA_KEPLER
 #define TEX_IMAGE_FLOAT_START_CUDA_KEPLER      (TEX_NUM_FLOAT4_IMAGES_CUDA_KEPLER + TEX_NUM_BYTE4_IMAGES_CUDA_KEPLER)
 #define TEX_IMAGE_BYTE_START_CUDA_KEPLER       (TEX_NUM_FLOAT4_IMAGES_CUDA_KEPLER + TEX_NUM_BYTE4_IMAGES_CUDA_KEPLER + TEX_NUM_BYTE_IMAGES_CUDA_KEPLER)