Cycles: Add support for bindless textures.
[blender.git] / intern / cycles / device / device_cuda.cpp
index 12c62c0702c61075c8c41acb417fb4019b4ef184..39bb44268263b681bc2d1c3abf7c01e33d024681 100644 (file)
@@ -85,10 +85,10 @@ public:
        CUcontext cuContext;
        CUmodule cuModule;
        map<device_ptr, bool> tex_interp_map;
+       map<device_ptr, uint> tex_bindless_map;
        int cuDevId;
        int cuDevArchitecture;
        bool first_error;
-       bool use_texture_storage;
 
        struct PixelMem {
                GLuint cuPBO;
@@ -99,6 +99,10 @@ public:
 
        map<device_ptr, PixelMem> pixel_mem_map;
 
+       /* Bindless Textures */
+       device_vector<uint> bindless_mapping;
+       bool need_bindless_mapping;
+
        CUdeviceptr cuda_device_ptr(device_ptr mem)
        {
                return (CUdeviceptr)mem;
@@ -176,12 +180,13 @@ public:
        {
                first_error = true;
                background = background_;
-               use_texture_storage = true;
 
                cuDevId = info.num;
                cuDevice = 0;
                cuContext = 0;
 
+               need_bindless_mapping = false;
+
                /* initialize */
                if(cuda_error(cuInit(0)))
                        return;
@@ -211,11 +216,6 @@ public:
                cuDeviceComputeCapability(&major, &minor, cuDevId);
                cuDevArchitecture = major*100 + minor*10;
 
-               /* In order to use full 6GB of memory on Titan cards, use arrays instead
-                * of textures. On earlier cards this seems slower, but on Titan it is
-                * actually slightly faster in tests. */
-               use_texture_storage = (cuDevArchitecture < 300);
-
                cuda_pop_context();
        }
 
@@ -223,6 +223,10 @@ public:
        {
                task_pool.stop();
 
+               if(info.has_bindless_textures) {
+                       tex_free(bindless_mapping);
+               }
+
                cuda_assert(cuCtxDestroy(cuContext));
        }
 
@@ -400,6 +404,15 @@ public:
                return (result == CUDA_SUCCESS);
        }
 
+       void load_bindless_mapping()
+       { /* Re-upload the bindless texture mapping table to the device when it is stale.
+          * Does nothing unless the device reports bindless texture support and
+          * need_bindless_mapping is set; tex_alloc sets that flag after it stores a
+          * new texture object handle in bindless_mapping. */
+               if(info.has_bindless_textures && need_bindless_mapping) {
+                       tex_free(bindless_mapping); /* release the previous upload before allocating again */
+                       tex_alloc("__bindless_mapping", bindless_mapping, INTERPOLATION_NONE, EXTENSION_REPEAT); /* INTERPOLATION_NONE: takes the plain data-storage path, not the image-array path */
+                       need_bindless_mapping = false; /* table is in sync until another texture object is created */
+               }
+       }
+
        void mem_alloc(device_memory& mem, MemoryType /*type*/)
        {
                cuda_push_context();
@@ -479,126 +492,99 @@ public:
        {
                VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
 
+               /* Check if we are on sm_30 or above.
+                * We use arrays and bindless textures for storage there */
+               bool has_bindless_textures = info.has_bindless_textures;
+
+               /* General variables for both architectures */
                string bind_name = name;
-               if(mem.data_depth > 1) {
-                       /* Kernel uses different bind names for 2d and 3d float textures,
-                        * so we have to adjust couple of things here.
-                        */
-                       vector<string> tokens;
-                       string_split(tokens, name, "_");
-                       bind_name = string_printf("__tex_image_%s_3d_%s",
-                                                 tokens[2].c_str(),
-                                                 tokens[3].c_str());
+               size_t dsize = datatype_size(mem.data_type);
+               size_t size = mem.memory_size();
+
+               CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
+               switch(extension) {
+                       case EXTENSION_REPEAT:
+                               address_mode = CU_TR_ADDRESS_MODE_WRAP;
+                               break;
+                       case EXTENSION_EXTEND:
+                               address_mode = CU_TR_ADDRESS_MODE_CLAMP;
+                               break;
+                       case EXTENSION_CLIP:
+                               address_mode = CU_TR_ADDRESS_MODE_BORDER;
+                               break;
+                       default:
+                               assert(0);
+                               break;
+               }
+
+               CUfilter_mode filter_mode;
+               if(interpolation == INTERPOLATION_CLOSEST) {
+                       filter_mode = CU_TR_FILTER_MODE_POINT;
+               }
+               else {
+                       filter_mode = CU_TR_FILTER_MODE_LINEAR;
                }
 
-               /* determine format */
                CUarray_format_enum format;
-               size_t dsize = datatype_size(mem.data_type);
-               size_t size = mem.memory_size();
-               bool use_texture = (interpolation != INTERPOLATION_NONE) || use_texture_storage;
+               switch(mem.data_type) {
+                       case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
+                       case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
+                       case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
+                       case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
+                       default: assert(0); return;
+               }
 
-               if(use_texture) {
+               /* General variables for Fermi */
+               CUtexref texref = NULL;
 
-                       switch(mem.data_type) {
-                               case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
-                               case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
-                               case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
-                               case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
-                               default: assert(0); return;
+               if(!has_bindless_textures) {
+                       if(mem.data_depth > 1) {
+                               /* Kernel uses different bind names for 2d and 3d float textures,
+                                * so we have to adjust a couple of things here.
+                                */
+                               vector<string> tokens;
+                               string_split(tokens, name, "_");
+                               bind_name = string_printf("__tex_image_%s_3d_%s",
+                                                         tokens[2].c_str(),
+                                                         tokens[3].c_str());
                        }
 
-                       CUtexref texref = NULL;
-
                        cuda_push_context();
                        cuda_assert(cuModuleGetTexRef(&texref, cuModule, bind_name.c_str()));
+                       cuda_pop_context();
 
                        if(!texref) {
-                               cuda_pop_context();
                                return;
                        }
+               }
 
-                       if(interpolation != INTERPOLATION_NONE) {
-                               CUarray handle = NULL;
-
-                               if(mem.data_depth > 1) {
-                                       CUDA_ARRAY3D_DESCRIPTOR desc;
-
-                                       desc.Width = mem.data_width;
-                                       desc.Height = mem.data_height;
-                                       desc.Depth = mem.data_depth;
-                                       desc.Format = format;
-                                       desc.NumChannels = mem.data_elements;
-                                       desc.Flags = 0;
-
-                                       cuda_assert(cuArray3DCreate(&handle, &desc));
-                               }
-                               else {
-                                       CUDA_ARRAY_DESCRIPTOR desc;
-
-                                       desc.Width = mem.data_width;
-                                       desc.Height = mem.data_height;
-                                       desc.Format = format;
-                                       desc.NumChannels = mem.data_elements;
-
-                                       cuda_assert(cuArrayCreate(&handle, &desc));
-                               }
+               /* Data Storage */
+               if(interpolation == INTERPOLATION_NONE) {
+                       if(has_bindless_textures) {
+                               mem_alloc(mem, MEM_READ_ONLY);
+                               mem_copy_to(mem);
 
-                               if(!handle) {
-                                       cuda_pop_context();
-                                       return;
-                               }
+                               cuda_push_context();
 
-                               if(mem.data_depth > 1) {
-                                       CUDA_MEMCPY3D param;
-                                       memset(&param, 0, sizeof(param));
-                                       param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
-                                       param.dstArray = handle;
-                                       param.srcMemoryType = CU_MEMORYTYPE_HOST;
-                                       param.srcHost = (void*)mem.data_pointer;
-                                       param.srcPitch = mem.data_width*dsize*mem.data_elements;
-                                       param.WidthInBytes = param.srcPitch;
-                                       param.Height = mem.data_height;
-                                       param.Depth = mem.data_depth;
-
-                                       cuda_assert(cuMemcpy3D(&param));
-                               }
-                               else if(mem.data_height > 1) {
-                                       CUDA_MEMCPY2D param;
-                                       memset(&param, 0, sizeof(param));
-                                       param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
-                                       param.dstArray = handle;
-                                       param.srcMemoryType = CU_MEMORYTYPE_HOST;
-                                       param.srcHost = (void*)mem.data_pointer;
-                                       param.srcPitch = mem.data_width*dsize*mem.data_elements;
-                                       param.WidthInBytes = param.srcPitch;
-                                       param.Height = mem.data_height;
-
-                                       cuda_assert(cuMemcpy2D(&param));
-                               }
-                               else
-                                       cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size));
+                               CUdeviceptr cumem;
+                               size_t cubytes;
 
-                               cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT));
+                               cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str()));
 
-                               if(interpolation == INTERPOLATION_CLOSEST) {
-                                       cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
+                               if(cubytes == 8) {
+                                       /* 64 bit device pointer */
+                                       uint64_t ptr = mem.device_pointer;
+                                       cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
                                }
-                               else if(interpolation == INTERPOLATION_LINEAR) {
-                                       cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR));
-                               }
-                               else {/* CUBIC and SMART are unsupported for CUDA */
-                                       cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR));
+                               else {
+                                       /* 32 bit device pointer */
+                                       uint32_t ptr = (uint32_t)mem.device_pointer;
+                                       cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
                                }
-                               cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES));
-
-                               mem.device_pointer = (device_ptr)handle;
-                               mem.device_size = size;
 
-                               stats.mem_alloc(size);
+                               cuda_pop_context();
                        }
                        else {
-                               cuda_pop_context();
-
                                mem_alloc(mem, MEM_READ_ONLY);
                                mem_copy_to(mem);
 
@@ -607,58 +593,149 @@ public:
                                cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size));
                                cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
                                cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER));
+
+                               cuda_pop_context();
                        }
+               }
+               /* Texture Storage */
+               else {
+                       CUarray handle = NULL;
 
-                       CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
-                       switch(extension) {
-                               case EXTENSION_REPEAT:
-                                       address_mode = CU_TR_ADDRESS_MODE_WRAP;
-                                       break;
-                               case EXTENSION_EXTEND:
-                                       address_mode = CU_TR_ADDRESS_MODE_CLAMP;
-                                       break;
-                               case EXTENSION_CLIP:
-                                       address_mode = CU_TR_ADDRESS_MODE_BORDER;
-                                       break;
-                               default:
-                                       assert(0);
-                                       break;
+                       cuda_push_context();
+
+                       if(mem.data_depth > 1) {
+                               CUDA_ARRAY3D_DESCRIPTOR desc;
+
+                               desc.Width = mem.data_width;
+                               desc.Height = mem.data_height;
+                               desc.Depth = mem.data_depth;
+                               desc.Format = format;
+                               desc.NumChannels = mem.data_elements;
+                               desc.Flags = 0;
+
+                               cuda_assert(cuArray3DCreate(&handle, &desc));
                        }
-                       cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode));
-                       cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode));
+                       else {
+                               CUDA_ARRAY_DESCRIPTOR desc;
+
+                               desc.Width = mem.data_width;
+                               desc.Height = mem.data_height;
+                               desc.Format = format;
+                               desc.NumChannels = mem.data_elements;
+
+                               cuda_assert(cuArrayCreate(&handle, &desc));
+                       }
+
+                       if(!handle) {
+                               cuda_pop_context();
+                               return;
+                       }
+
+                       /* Copy the host data into the 3D, 2D or 1D array created above */
                        if(mem.data_depth > 1) {
-                               cuda_assert(cuTexRefSetAddressMode(texref, 2, address_mode));
+                               CUDA_MEMCPY3D param;
+                               memset(&param, 0, sizeof(param));
+                               param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+                               param.dstArray = handle;
+                               param.srcMemoryType = CU_MEMORYTYPE_HOST;
+                               param.srcHost = (void*)mem.data_pointer;
+                               param.srcPitch = mem.data_width*dsize*mem.data_elements;
+                               param.WidthInBytes = param.srcPitch;
+                               param.Height = mem.data_height;
+                               param.Depth = mem.data_depth;
+
+                               cuda_assert(cuMemcpy3D(&param));
                        }
+                       else if(mem.data_height > 1) {
+                               CUDA_MEMCPY2D param;
+                               memset(&param, 0, sizeof(param));
+                               param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+                               param.dstArray = handle;
+                               param.srcMemoryType = CU_MEMORYTYPE_HOST;
+                               param.srcHost = (void*)mem.data_pointer;
+                               param.srcPitch = mem.data_width*dsize*mem.data_elements;
+                               param.WidthInBytes = param.srcPitch;
+                               param.Height = mem.data_height;
+
+                               cuda_assert(cuMemcpy2D(&param));
+                       }
+                       else
+                               cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size));
 
-                       cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements));
+                       /* Fermi and Kepler */
+                       mem.device_pointer = (device_ptr)handle;
+                       mem.device_size = size;
 
-                       cuda_pop_context();
-               }
-               else {
-                       mem_alloc(mem, MEM_READ_ONLY);
-                       mem_copy_to(mem);
+                       stats.mem_alloc(size);
 
-                       cuda_push_context();
+                       /* Bindless Textures - Kepler */
+                       if(has_bindless_textures) {
+                               int flat_slot = 0;
+                               if(string_startswith(name, "__tex_image")) {
+                                       int pos =  string(name).rfind("_");
+                                       flat_slot = atoi(name + pos + 1);
+                               }
+                               else {
+                                       assert(0);
+                               }
 
-                       CUdeviceptr cumem;
-                       size_t cubytes;
+                               CUDA_RESOURCE_DESC resDesc;
+                               memset(&resDesc, 0, sizeof(resDesc));
+                               resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
+                               resDesc.res.array.hArray = handle;
+                               resDesc.flags = 0;
+
+                               CUDA_TEXTURE_DESC texDesc;
+                               memset(&texDesc, 0, sizeof(texDesc));
+                               texDesc.addressMode[0] = address_mode;
+                               texDesc.addressMode[1] = address_mode;
+                               texDesc.addressMode[2] = address_mode;
+                               texDesc.filterMode = filter_mode;
+                               texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
+
+                               CUtexObject tex = 0;
+                               cuda_assert(cuTexObjectCreate(&tex, &resDesc, &texDesc, NULL));
+
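+                               /* Safety check: the handle is stored in a 32-bit uint below, so it must fit.
+                                * NOTE(review): the (uint) cast truncates before the comparison, so this
+                                * condition can never be true; compare the uncast CUtexObject instead. */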
+                               if((uint)tex > UINT_MAX) {
+                                       assert(0);
+                               }
 
-                       cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str()));
+                               /* Resize once */
+                               if(flat_slot >= bindless_mapping.size())
+                                       bindless_mapping.resize(4096); /*TODO(dingto): Make this a variable */
 
-                       if(cubytes == 8) {
-                               /* 64 bit device pointer */
-                               uint64_t ptr = mem.device_pointer;
-                               cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
+                               /* Set Mapping and tag that we need to (re-)upload to device */
+                               bindless_mapping.get_data()[flat_slot] = (uint)tex;
+                               tex_bindless_map[mem.device_pointer] = (uint)tex;
+                               need_bindless_mapping = true;
                        }
+                       /* Regular Textures - Fermi */
                        else {
-                               /* 32 bit device pointer */
-                               uint32_t ptr = (uint32_t)mem.device_pointer;
-                               cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
+                               cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT));
+                               cuda_assert(cuTexRefSetFilterMode(texref, filter_mode));
+                               cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES));
+                       }
+
+                       cuda_pop_context();
+               }
+
+               /* Fermi, Data and Image Textures */
+               if(!has_bindless_textures) {
+                       cuda_push_context();
+
+                       cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode));
+                       cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode));
+                       if(mem.data_depth > 1) {
+                               cuda_assert(cuTexRefSetAddressMode(texref, 2, address_mode));
                        }
 
+                       cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements));
+
                        cuda_pop_context();
                }
 
+               /* Fermi and Kepler */
                tex_interp_map[mem.device_pointer] = (interpolation != INTERPOLATION_NONE);
        }
 
@@ -670,6 +747,12 @@ public:
                                cuArrayDestroy((CUarray)mem.device_pointer);
                                cuda_pop_context();
 
+                               /* Free CUtexObject (Bindless Textures) */
+                               if(info.has_bindless_textures && tex_bindless_map[mem.device_pointer]) {
+                                       uint flat_slot = tex_bindless_map[mem.device_pointer];
+                                       cuTexObjectDestroy(flat_slot);
+                               }
+
                                tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
                                mem.device_pointer = 0;
 
@@ -1111,6 +1194,9 @@ public:
                        RenderTile tile;
                        
                        bool branched = task->integrator_branched;
+
+                       /* Upload Bindless Mapping */
+                       load_bindless_mapping();
                        
                        /* keep rendering tiles until done */
                        while(task->acquire_tile(this, tile)) {
@@ -1134,6 +1220,9 @@ public:
                        }
                }
                else if(task->type == DeviceTask::SHADER) {
+                       /* Upload Bindless Mapping */
+                       load_bindless_mapping();
+
                        shader(*task);
 
                        cuda_push_context();
@@ -1269,7 +1358,7 @@ void device_cuda_info(vector<DeviceInfo>& devices)
                info.num = num;
 
                info.advanced_shading = (major >= 2);
-               info.extended_images = (major >= 3);
+               info.has_bindless_textures = (major >= 3);
                info.pack_images = false;
 
                /* if device has a kernel timeout, assume it is used for display */