author     Brecht Van Lommel <brechtvanlommel@pandora.be>
           Fri, 27 Sep 2013 19:09:31 +0000 (19:09 +0000)
committer  Brecht Van Lommel <brechtvanlommel@pandora.be>
           Fri, 27 Sep 2013 19:09:31 +0000 (19:09 +0000)

Fix #35684: cycles unable to use full 6GB of memory on NVidia Titan GPU. We now
use arrays instead of textures for general storage on this card (image textures
are still stored as textures). Textures were found to be faster on older cards,
but the limits on 1D texture size have not increased along with the memory size,
which meant that the full 6 GB could not be used.

The performance actually seems to be slightly better with arrays in some tests
on Titan. For older cards the results are more of a mix: some are better, others
are not. We may change those to use arrays too, but more testing is needed;
only Titan and Tesla K20 (sm_35) are changed for now.

The fact that arrays are faster is a bit surprising, as others have found
textures to be faster on Kepler. However, even if textures were faster, the
memory limitation is more important to solve anyway.
https://research.nvidia.com/publication/understanding-efficiency-ray-traversal-gpus-kepler-and-fermi-addendum
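
(Not part of this commit, just for illustration: a minimal driver-API sketch of
the limit in question. It assumes the relevant cap is
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH, the maximum element count
of a 1D texture bound to linear memory, which does not grow with total device
memory and so limits how much of the 6 GB a tex1Dfetch-based path can address.
It also mirrors the sm_35 check added to device_cuda.cpp below. Error handling
is omitted.)

/* Standalone sketch, not Cycles code: query the 1D linear texture limit and
 * total memory, then pick the storage mode the same way this commit does. */
#include <cuda.h>
#include <stdio.h>

int main(void)
{
	CUdevice dev;
	int major, minor, tex1d_width;
	size_t total_mem;

	cuInit(0);
	cuDeviceGet(&dev, 0);

	cuDeviceComputeCapability(&major, &minor, dev);
	cuDeviceTotalMem(&total_mem, dev);
	cuDeviceGetAttribute(&tex1d_width,
		CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH, dev);

	/* same decision as the device_cuda.cpp change: sm_35 and newer use arrays */
	int use_texture_storage = (major*100 + minor*10 < 350);

	printf("sm_%d%d: %zu MB total, 1D linear texture limit %d elements, using %s\n",
	       major, minor, total_mem/(1024*1024), tex1d_width,
	       use_texture_storage ? "textures" : "arrays");
	return 0;
}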

intern/cycles/device/device_cuda.cpp
intern/cycles/kernel/kernel_bvh.h
intern/cycles/kernel/kernel_compat_cuda.h
intern/cycles/kernel/kernel_globals.h
intern/cycles/kernel/kernel_primitive.h

index b5eaa69bf0e8e2343baaeb1a98fdb06a1f6274c0..5440bd91987eda9368a4a8aba55a8cf97275e4f9 100644 (file)
@@ -43,7 +43,9 @@ public:
        CUmodule cuModule;
        map<device_ptr, bool> tex_interp_map;
        int cuDevId;
+       int cuDevArchitecture;
        bool first_error;
+       bool use_texture_storage;
 
        struct PixelMem {
                GLuint cuPBO;
@@ -173,6 +175,7 @@ public:
        {
                first_error = true;
                background = background_;
+               use_texture_storage = true;
 
                cuDevId = info.num;
                cuDevice = 0;
@@ -203,6 +206,15 @@ public:
                if(cuda_error_(result, "cuCtxCreate"))
                        return;
 
+               int major, minor;
+               cuDeviceComputeCapability(&major, &minor, cuDevId);
+               cuDevArchitecture = major*100 + minor*10;
+
+               /* In order to use full 6GB of memory on Titan cards, use arrays instead
+                * of textures. On earlier cards this seems slower, but on Titan it is
+                * actually slightly faster in tests. */
+               use_texture_storage = (cuDevArchitecture < 350);
+
                cuda_pop_context();
        }
 
@@ -210,8 +222,7 @@ public:
        {
                task_pool.stop();
 
-               cuda_push_context();
-               cuda_assert(cuCtxDetach(cuContext))
+               cuda_assert(cuCtxDestroy(cuContext))
        }
 
        bool support_device(bool experimental)
@@ -448,90 +459,118 @@ public:
                CUarray_format_enum format;
                size_t dsize = datatype_size(mem.data_type);
                size_t size = mem.memory_size();
+               bool use_texture = interpolation || use_texture_storage;
 
-               switch(mem.data_type) {
-                       case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
-                       case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
-                       case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
-                       case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
-                       default: assert(0); return;
-               }
-
-               CUtexref texref = NULL;
-
-               cuda_push_context();
-               cuda_assert(cuModuleGetTexRef(&texref, cuModule, name))
-
-               if(!texref) {
-                       cuda_pop_context();
-                       return;
-               }
+               if(use_texture) {
 
-               if(interpolation) {
-                       CUarray handle = NULL;
-                       CUDA_ARRAY_DESCRIPTOR desc;
+                       switch(mem.data_type) {
+                               case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
+                               case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
+                               case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
+                               case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
+                               default: assert(0); return;
+                       }
 
-                       desc.Width = mem.data_width;
-                       desc.Height = mem.data_height;
-                       desc.Format = format;
-                       desc.NumChannels = mem.data_elements;
+                       CUtexref texref = NULL;
 
-                       cuda_assert(cuArrayCreate(&handle, &desc))
+                       cuda_push_context();
+                       cuda_assert(cuModuleGetTexRef(&texref, cuModule, name))
 
-                       if(!handle) {
+                       if(!texref) {
                                cuda_pop_context();
                                return;
                        }
 
-                       if(mem.data_height > 1) {
-                               CUDA_MEMCPY2D param;
-                               memset(&param, 0, sizeof(param));
-                               param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
-                               param.dstArray = handle;
-                               param.srcMemoryType = CU_MEMORYTYPE_HOST;
-                               param.srcHost = (void*)mem.data_pointer;
-                               param.srcPitch = mem.data_width*dsize*mem.data_elements;
-                               param.WidthInBytes = param.srcPitch;
-                               param.Height = mem.data_height;
-
-                               cuda_assert(cuMemcpy2D(&param))
+                       if(interpolation) {
+                               CUarray handle = NULL;
+                               CUDA_ARRAY_DESCRIPTOR desc;
+
+                               desc.Width = mem.data_width;
+                               desc.Height = mem.data_height;
+                               desc.Format = format;
+                               desc.NumChannels = mem.data_elements;
+
+                               cuda_assert(cuArrayCreate(&handle, &desc))
+
+                               if(!handle) {
+                                       cuda_pop_context();
+                                       return;
+                               }
+
+                               if(mem.data_height > 1) {
+                                       CUDA_MEMCPY2D param;
+                                       memset(&param, 0, sizeof(param));
+                                       param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+                                       param.dstArray = handle;
+                                       param.srcMemoryType = CU_MEMORYTYPE_HOST;
+                                       param.srcHost = (void*)mem.data_pointer;
+                                       param.srcPitch = mem.data_width*dsize*mem.data_elements;
+                                       param.WidthInBytes = param.srcPitch;
+                                       param.Height = mem.data_height;
+
+                                       cuda_assert(cuMemcpy2D(&param))
+                               }
+                               else
+                                       cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size))
+
+                               cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT))
+
+                               cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR))
+                               cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES))
+
+                               mem.device_pointer = (device_ptr)handle;
+
+                               stats.mem_alloc(size);
                        }
-                       else
-                               cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size))
+                       else {
+                               cuda_pop_context();
+
+                               mem_alloc(mem, MEM_READ_ONLY);
+                               mem_copy_to(mem);
 
-                       cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT))
+                               cuda_push_context();
 
-                       cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR))
-                       cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES))
+                               cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size))
+                               cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT))
+                               cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER))
+                       }
 
-                       mem.device_pointer = (device_ptr)handle;
+                       if(periodic) {
+                               cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_WRAP))
+                               cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_WRAP))
+                       }
+                       else {
+                               cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP))
+                               cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP))
+                       }
+                       cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements))
 
-                       stats.mem_alloc(size);
+                       cuda_pop_context();
                }
                else {
-                       cuda_pop_context();
-
                        mem_alloc(mem, MEM_READ_ONLY);
                        mem_copy_to(mem);
 
                        cuda_push_context();
 
-                       cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size))
-                       cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT))
-                       cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER))
-               }
+                       CUdeviceptr cumem;
+                       size_t cubytes;
 
-               if(periodic) {
-                       cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_WRAP))
-                       cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_WRAP))
-               }
-               else {
-                       cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP))
-                       cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP))
-               }
-               cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements))
+                       cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, name))
 
-               cuda_pop_context();
+                       if(cubytes == 8) {
+                               /* 64 bit device pointer */
+                               uint64_t ptr = mem.device_pointer;
+                               cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes))
+                       }
+                       else {
+                               /* 32 bit device pointer */
+                               uint32_t ptr = (uint32_t)mem.device_pointer;
+                               cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes))
+                       }
+
+                       cuda_pop_context();
+               }
 
                tex_interp_map[mem.device_pointer] = interpolation;
        }
index 4b01f2eebcdb32a66fa7844503035ac1a307e858..44a9822c103d65ddfbc09ec968a9b9b23c5529bf 100644 (file)
@@ -809,11 +809,16 @@ __device_inline void bvh_triangle_intersect_subsurface(KernelGlobals *kg, Inters
 #include "kernel_bvh_subsurface.h"
 #endif
 
-
+/* to work around titan bug when using arrays instead of textures */
+#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
+__device_inline
+#else
+__device_noinline
+#endif
 #ifdef __HAIR__ 
-__device_inline bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect, uint *lcg_state, float difl, float extmax)
+bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect, uint *lcg_state, float difl, float extmax)
 #else
-__device_inline bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect)
+bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect)
 #endif
 {
 #ifdef __OBJECT_MOTION__
@@ -851,8 +856,14 @@ __device_inline bool scene_intersect(KernelGlobals *kg, const Ray *ray, const ui
 #endif /* __KERNEL_CPU__ */
 }
 
+/* to work around titan bug when using arrays instead of textures */
 #ifdef __SUBSURFACE__
-__device_inline uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection *isect, int subsurface_object, uint *lcg_state, int max_hits)
+#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
+__device_inline
+#else
+__device_noinline
+#endif
+uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection *isect, int subsurface_object, uint *lcg_state, int max_hits)
 {
 #ifdef __OBJECT_MOTION__
        if(kernel_data.bvh.have_motion) {
index cb86ce8c4ae024a5f317adbc9c45034696711428..44c2b9effe9286832cce9e19c9a03ed25aa09747 100644 (file)
@@ -57,7 +57,18 @@ typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4;
 
 /* Macros to handle different memory storage on different devices */
 
+/* In order to use full 6GB of memory on Titan cards, use arrays instead
+ * of textures. On earlier cards this seems slower, but on Titan it is
+ * actually slightly faster in tests. */
+#if __CUDA_ARCH__ < 350
+#define __KERNEL_CUDA_TEX_STORAGE__
+#endif
+
+#ifdef __KERNEL_CUDA_TEX_STORAGE__
 #define kernel_tex_fetch(t, index) tex1Dfetch(t, index)
+#else
+#define kernel_tex_fetch(t, index) t[(index)]
+#endif
 #define kernel_tex_image_interp(t, x, y) tex2D(t, x, y)
 
 #define kernel_data __data
index ab0a717b5928acf4a6987d4f3c58356d740d3054..b5e691eb615c98321b6979fc2a4d6ea9f97d55b5 100644 (file)
@@ -66,7 +66,11 @@ typedef struct KernelGlobals {
 __constant__ KernelData __data;
 typedef struct KernelGlobals {} KernelGlobals;
 
+#ifdef __KERNEL_CUDA_TEX_STORAGE__
 #define KERNEL_TEX(type, ttype, name) ttype name;
+#else
+#define KERNEL_TEX(type, ttype, name) const __constant__ __device__ type *name;
+#endif
 #define KERNEL_IMAGE_TEX(type, ttype, name) ttype name;
 #include "kernel_textures.h"
 
index 4a06dff84bfba5c307c9fe58791459dd3931033f..636cfd065321701655c1f6cb8023c7d4153cdb19 100644 (file)
@@ -93,7 +93,11 @@ __device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
 {
 #ifdef __HAIR__
        if(sd->segment != ~0)
+#ifdef __DPDU__
                return normalize(sd->dPdu);
+#else
+               return make_float3(0.0f, 0.0f, 0.0f);
+#endif
 #endif
 
        /* try to create spherical tangent from generated coordinates */
@@ -108,7 +112,11 @@ __device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
        }
        else {
                /* otherwise use surface derivatives */
+#ifdef __DPDU__
                return normalize(sd->dPdu);
+#else
+               return make_float3(0.0f, 0.0f, 0.0f);
+#endif
        }
 }
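
(Again not part of the commit, only a summary of the array path end to end:
when __KERNEL_CUDA_TEX_STORAGE__ is not defined, KERNEL_TEX declares a pointer
in constant memory, device_cuda.cpp writes the device pointer into it via
cuModuleGetGlobal + cuMemcpyHtoD, and kernel_tex_fetch becomes a plain index.
Below is a minimal runtime-API sketch of the same pattern; the names are
hypothetical and it is not Cycles code.)

#include <cstdio>
#include <cuda_runtime.h>

/* plays the role of a KERNEL_TEX entry when __KERNEL_CUDA_TEX_STORAGE__ is
 * not defined: a device pointer stored in constant memory */
__constant__ const float *tex_example_data;

__global__ void fetch_kernel(float *out, int index)
{
	/* equivalent of kernel_tex_fetch(tex_example_data, index) in array mode */
	*out = tex_example_data[index];
}

int main()
{
	float host[4] = {1.0f, 2.0f, 3.0f, 4.0f};
	float *data = NULL, *out = NULL, result = 0.0f;

	cudaMalloc(&data, sizeof(host));
	cudaMalloc(&out, sizeof(float));
	cudaMemcpy(data, host, sizeof(host), cudaMemcpyHostToDevice);

	/* runtime-API counterpart of the cuModuleGetGlobal + cuMemcpyHtoD pair in
	 * device_cuda.cpp: copy the device pointer itself into the constant symbol */
	cudaMemcpyToSymbol(tex_example_data, &data, sizeof(data));

	fetch_kernel<<<1, 1>>>(out, 2);
	cudaMemcpy(&result, out, sizeof(float), cudaMemcpyDeviceToHost);
	printf("%f\n", result); /* expect 3.0 */

	cudaFree(out);
	cudaFree(data);
	return 0;
}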