Cycles: Implement automatic global size for CUDA split kernel
authorMai Lavelle <mai.lavelle@gmail.com>
Tue, 11 Apr 2017 06:36:08 +0000 (02:36 -0400)
committerMai Lavelle <mai.lavelle@gmail.com>
Tue, 11 Apr 2017 07:11:18 +0000 (03:11 -0400)
Not sure this is the best way to do things for CUDA, but it's much better than
being unimplemented.

intern/cycles/device/device_cuda.cpp

index 4c1a49878f5d80dcc5d20e96b4fa8d1b22c56e5b..ef283c9d455010577236524bdfa028b6ced54d9c 100644 (file)
@@ -1613,10 +1613,23 @@ int2 CUDASplitKernel::split_kernel_local_size()
        return make_int2(32, 1);
 }
 
-int2 CUDASplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask * /*task*/)
+int2 CUDASplitKernel::split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/)
 {
-       /* TODO(mai): implement something here to detect ideal work size */
-       return make_int2(256, 256);
+       size_t free;
+       size_t total;
+
+       device->cuda_push_context();
+       cuda_assert(cuMemGetInfo(&free, &total));
+       device->cuda_pop_context();
+
+       VLOG(1) << "Maximum device allocation size: "
+               << string_human_readable_number(free) << " bytes. ("
+               << string_human_readable_size(free) << ").";
+
+       size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2);
+       int2 global_size = make_int2(round_down((int)sqrt(num_elements), 32), (int)sqrt(num_elements));
+       VLOG(1) << "Global size: " << global_size << ".";
+       return global_size;
 }
 
 bool device_cuda_init(void)