Fix T50888: Numeric overflow in split kernel state buffer size calculation
authorMai Lavelle <mai.lavelle@gmail.com>
Sat, 11 Mar 2017 10:23:11 +0000 (05:23 -0500)
committerMai Lavelle <mai.lavelle@gmail.com>
Sat, 11 Mar 2017 10:39:28 +0000 (05:39 -0500)
Overflow led to the state buffer being too small and the split kernel to
get stuck doing nothing forever.

intern/cycles/device/device_cpu.cpp
intern/cycles/device/device_cuda.cpp
intern/cycles/device/device_memory.h
intern/cycles/device/device_split_kernel.cpp
intern/cycles/device/device_split_kernel.h
intern/cycles/device/opencl/opencl_base.cpp
intern/cycles/device/opencl/opencl_split.cpp
intern/cycles/kernel/kernels/cuda/kernel_split.cu
intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl
intern/cycles/kernel/split/kernel_split_data.h
intern/cycles/util/util_types.h

index 06a1568b4d62a513cff0450a2426d367fc5a0457..273c3b489362e43c7c32e950c1add464a82cb594 100644 (file)
@@ -72,7 +72,7 @@ public:
        virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
        virtual int2 split_kernel_local_size();
        virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
-       virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
+       virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
 };
 
 class CPUDevice : public Device
@@ -860,7 +860,7 @@ int2 CPUSplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memo
        return task->requested_tile_size;
 }
 
-size_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device_memory& /*data*/, size_t num_threads) {
+uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device_memory& /*data*/, size_t num_threads) {
        KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
 
        return split_data_buffer_size(kg, num_threads);
index a630a3d1183b40aa18887440357b756672237826..58471ba67c2ddcc78cc05f0c278fd4026475668b 100644 (file)
@@ -89,7 +89,7 @@ class CUDASplitKernel : public DeviceSplitKernel {
 public:
        explicit CUDASplitKernel(CUDADevice *device);
 
-       virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
+       virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
 
        virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
                                                    RenderTile& rtile,
@@ -1473,9 +1473,9 @@ CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device)
 {
 }
 
-size_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads)
+uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads)
 {
-       device_vector<uint> size_buffer;
+       device_vector<uint64_t> size_buffer;
        size_buffer.resize(1);
        device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE);
 
@@ -1504,7 +1504,7 @@ size_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory&
 
        device->cuda_pop_context();
 
-       device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint));
+       device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint64_t));
        device->mem_free(size_buffer);
 
        return *size_buffer.get_data();
index b69c3dad60434d87b181c154430a9daafe0e1d6a..60d166b43ba3ebd5317bc0145b6fd9eb07ca7cce 100644 (file)
@@ -48,7 +48,8 @@ enum DataType {
        TYPE_UINT,
        TYPE_INT,
        TYPE_FLOAT,
-       TYPE_HALF
+       TYPE_HALF,
+       TYPE_UINT64,
 };
 
 static inline size_t datatype_size(DataType datatype) 
@@ -59,6 +60,7 @@ static inline size_t datatype_size(DataType datatype)
                case TYPE_UINT: return sizeof(uint);
                case TYPE_INT: return sizeof(int);
                case TYPE_HALF: return sizeof(half);
+               case TYPE_UINT64: return sizeof(uint64_t);
                default: return 0;
        }
 }
@@ -160,6 +162,11 @@ template<> struct device_type_traits<half4> {
        static const int num_elements = 4;
 };
 
+template<> struct device_type_traits<uint64_t> {
+       static const DataType data_type = TYPE_UINT64;
+       static const int num_elements = 1;
+};
+
 /* Device Memory */
 
 class device_memory
index 10a642ed4d0c8feee5c2971c68f537a153c03421..5b892038ebb9fad0c99fbaa815a8dbf120b3d9e2 100644 (file)
@@ -105,9 +105,9 @@ bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_fe
        return true;
 }
 
-size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, size_t max_buffer_size)
+size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size)
 {
-       size_t size_per_element = state_buffer_size(kg, data, 1024) / 1024;
+       uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024;
        return max_buffer_size / size_per_element;
 }
 
index ae61f9e38c1c52df2acd2ef1ef5285b0df5c2a59..6739e754862ef8b732f8fb3f5b3aca9e9bd3149f 100644 (file)
@@ -105,8 +105,8 @@ public:
                        device_memory& kgbuffer,
                        device_memory& kernel_data);
 
-       virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads) = 0;
-       size_t max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, size_t max_buffer_size);
+       virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads) = 0;
+       size_t max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size);
 
        virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
                                                    RenderTile& rtile,
index c5f44f84e8c350ebacd93bcb7b293d53958f4ca7..51ff39f0ad3b81b2181e1d5984b2823f8c7e8bfa 100644 (file)
@@ -334,11 +334,11 @@ void OpenCLDeviceBase::mem_zero(device_memory& mem)
                        size_t num_threads = global_size[0] * global_size[1];
 
                        cl_mem d_buffer = CL_MEM_PTR(mem.device_pointer);
-                       unsigned long long d_offset = 0;
-                       unsigned long long d_size = 0;
+                       cl_ulong d_offset = 0;
+                       cl_ulong d_size = 0;
 
                        while(d_offset < mem.memory_size()) {
-                               d_size = std::min<unsigned long long>(num_threads*sizeof(float4), mem.memory_size() - d_offset);
+                               d_size = std::min<cl_ulong>(num_threads*sizeof(float4), mem.memory_size() - d_offset);
 
                                kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
 
index 89ab19ca93b440389643fbcb15624824219d0532..a09d93c625e35bf473cf18e6d0eab3ee835bc45c 100644 (file)
@@ -227,9 +227,9 @@ public:
                return kernel;
        }
 
-       virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads)
+       virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads)
        {
-               device_vector<uint> size_buffer;
+               device_vector<uint64_t> size_buffer;
                size_buffer.resize(1);
                device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE);
 
@@ -249,7 +249,7 @@ public:
 
                device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
 
-               device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint));
+               device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint64_t));
                device->mem_free(size_buffer);
 
                if(device->ciErr != CL_SUCCESS) {
@@ -346,8 +346,8 @@ public:
 
        virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask */*task*/)
        {
-               size_t max_buffer_size;
-               clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_buffer_size, NULL);
+               cl_ulong max_buffer_size;
+               clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
                VLOG(1) << "Maximum device allocation side: "
                        << string_human_readable_number(max_buffer_size) << " bytes. ("
                        << string_human_readable_size(max_buffer_size) << ").";
index 6c508c2cd79d9b45eea26e3b723d8f79b15265d7..fbdf79697d5adaabbce8938d140ae97622d30716 100644 (file)
@@ -46,7 +46,7 @@
 /* kernels */
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_state_buffer_size(uint num_threads, uint *size)
+kernel_cuda_state_buffer_size(uint num_threads, uint64_t *size)
 {
        *size = split_data_buffer_size(NULL, num_threads);
 }
index 0a1843ff8bd6eabe038ac93cbd2c067dda17307d..4c9bf63ef5121056b820a6213f107719bb400259 100644 (file)
@@ -21,7 +21,7 @@ __kernel void kernel_ocl_path_trace_state_buffer_size(
         KernelGlobals *kg,
         ccl_constant KernelData *data,
         uint num_threads,
-        ccl_global uint *size)
+        ccl_global uint64_t *size)
 {
        kg->data = data;
        *size = split_data_buffer_size(kg, num_threads);
index 81dcdbaeddee8d291eed619129809368de642712..d319514c190d3dbf5f1fe69d8f576af787aff6ec 100644 (file)
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device_inline size_t split_data_buffer_size(KernelGlobals *kg, size_t num_elements)
+ccl_device_inline uint64_t split_data_buffer_size(KernelGlobals *kg, size_t num_elements)
 {
        (void)kg;  /* Unused on CPU. */
 
-       size_t size = 0;
+       uint64_t size = 0;
 #define SPLIT_DATA_ENTRY(type, name, num) + align_up(num_elements * num * sizeof(type), 16)
        size = size SPLIT_DATA_ENTRIES;
 #undef SPLIT_DATA_ENTRY
index 36d2f1053c718b88c723cc056795b6fe075ccd65..dcd0b78e4a4632adcb690f814622e6f286682b20 100644 (file)
@@ -106,10 +106,16 @@ typedef unsigned int uint;
 
 #endif
 
-#ifndef __KERNEL_GPU__
-
 /* Fixed Bits Types */
 
+#ifdef __KERNEL_OPENCL__
+
+typedef ulong uint64_t;
+
+#endif
+
+#ifndef __KERNEL_GPU__
+
 #ifdef _WIN32
 
 typedef signed char int8_t;
@@ -474,17 +480,17 @@ ccl_device_inline int4 make_int4(const float3& f)
 
 #endif
 
-ccl_device_inline int align_up(int offset, int alignment)
+ccl_device_inline size_t align_up(size_t offset, size_t alignment)
 {
        return (offset + alignment - 1) & ~(alignment - 1);
 }
 
-ccl_device_inline int round_up(int x, int multiple)
+ccl_device_inline size_t round_up(size_t x, size_t multiple)
 {
        return ((x + multiple - 1) / multiple) * multiple;
 }
 
-ccl_device_inline int round_down(int x, int multiple)
+ccl_device_inline size_t round_down(size_t x, size_t multiple)
 {
        return (x / multiple) * multiple;
 }