2 * Copyright 2011-2013 Blender Foundation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
23 #include "device/device.h"
24 #include "device/device_intern.h"
25 #include "device/device_split_kernel.h"
27 #include "render/buffers.h"
29 #ifdef WITH_CUDA_DYNLOAD
32 # include "util/util_opengl.h"
36 #include "util/util_debug.h"
37 #include "util/util_logging.h"
38 #include "util/util_map.h"
39 #include "util/util_md5.h"
40 #include "util/util_opengl.h"
41 #include "util/util_path.h"
42 #include "util/util_string.h"
43 #include "util/util_system.h"
44 #include "util/util_types.h"
45 #include "util/util_time.h"
47 #include "kernel/split/kernel_split_data_types.h"
51 #ifndef WITH_CUDA_DYNLOAD
/* Transparently implement some functions, so the majority of the file does not
 * need to worry about the difference between dynamically loaded and linked CUDA at all.
59 const char *cuewErrorString(CUresult result)
	/* We can only give the error code here without major code duplication; that
	 * should be enough, since dynamic loading is only disabled by folks who
	 * know what they're doing anyway.
	 * NOTE: Avoid calling this from several threads.
68 error = string_printf("%d", result);
72 const char *cuewCompilerPath(void)
74 return CYCLES_CUDA_NVCC_EXECUTABLE;
77 int cuewCompilerVersion(void)
79 return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10);
83 #endif /* WITH_CUDA_DYNLOAD */
87 class CUDASplitKernel : public DeviceSplitKernel {
90 explicit CUDASplitKernel(CUDADevice *device);
92 virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
94 virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
96 int num_global_elements,
97 device_memory& kernel_globals,
98 device_memory& kernel_data_,
99 device_memory& split_data,
100 device_memory& ray_state,
101 device_memory& queue_index,
102 device_memory& use_queues_flag,
103 device_memory& work_pool_wgs);
105 virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
106 virtual int2 split_kernel_local_size();
107 virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
110 class CUDADevice : public Device
113 DedicatedTaskPool task_pool;
117 map<device_ptr, bool> tex_interp_map;
118 map<device_ptr, uint> tex_bindless_map;
120 int cuDevArchitecture;
125 CUgraphicsResource cuPBOresource;
130 map<device_ptr, PixelMem> pixel_mem_map;
132 /* Bindless Textures */
133 device_vector<uint> bindless_mapping;
134 bool need_bindless_mapping;
136 CUdeviceptr cuda_device_ptr(device_ptr mem)
138 return (CUdeviceptr)mem;
141 static bool have_precompiled_kernels()
143 string cubins_path = path_get("lib");
144 return path_exists(cubins_path);
147 virtual bool show_samples() const
149 /* The CUDADevice only processes one tile at a time, so showing samples is fine. */
156 #define cuda_abort() abort()
158 void cuda_error_documentation()
161 fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
162 fprintf(stderr, "https://docs.blender.org/manual/en/dev/render/cycles/gpu_rendering.html\n\n");
167 #define cuda_assert(stmt) \
169 CUresult result = stmt; \
171 if(result != CUDA_SUCCESS) { \
172 string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
173 if(error_msg == "") \
174 error_msg = message; \
175 fprintf(stderr, "%s\n", message.c_str()); \
177 cuda_error_documentation(); \
181 bool cuda_error_(CUresult result, const string& stmt)
183 if(result == CUDA_SUCCESS)
186 string message = string_printf("CUDA error at %s: %s", stmt.c_str(), cuewErrorString(result));
189 fprintf(stderr, "%s\n", message.c_str());
190 cuda_error_documentation();
194 #define cuda_error(stmt) cuda_error_(stmt, #stmt)
196 void cuda_error_message(const string& message)
200 fprintf(stderr, "%s\n", message.c_str());
201 cuda_error_documentation();
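	/* Helpers to make the CUDA context current on the calling thread and to
	 * release it again; the driver API calls below require an active context. */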
204 void cuda_push_context()
206 cuda_assert(cuCtxSetCurrent(cuContext));
209 void cuda_pop_context()
211 cuda_assert(cuCtxSetCurrent(NULL));
214 CUDADevice(DeviceInfo& info, Stats &stats, bool background_)
215 : Device(info, stats, background_)
218 background = background_;
224 need_bindless_mapping = false;
227 if(cuda_error(cuInit(0)))
230 /* setup device and context */
231 if(cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
237 result = cuCtxCreate(&cuContext, 0, cuDevice);
240 result = cuGLCtxCreate(&cuContext, 0, cuDevice);
242 if(result != CUDA_SUCCESS) {
243 result = cuCtxCreate(&cuContext, 0, cuDevice);
248 if(cuda_error_(result, "cuCtxCreate"))
252 cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
253 cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
254 cuDevArchitecture = major*100 + minor*10;
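		/* cuDevArchitecture now encodes the compute capability as a single
		 * integer, e.g. 300 for sm_30 or 520 for sm_52. */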
263 if(info.has_bindless_textures) {
264 tex_free(bindless_mapping);
267 cuda_assert(cuCtxDestroy(cuContext));
270 bool support_device(const DeviceRequestedFeatures& /*requested_features*/)
273 cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
274 cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
276 /* We only support sm_20 and above */
			cuda_error_message(string_printf("CUDA device requires compute capability 2.0 or up, found %d.%d.", major, minor));
285 bool use_adaptive_compilation()
287 return DebugFlags().cuda.adaptive_compile;
290 bool use_split_kernel()
292 return DebugFlags().cuda.split_kernel;
	/* Common NVCC flags which stay the same regardless of shading model or
	 * kernel sources MD5, and only depend on compiler or compilation settings.
298 string compile_kernel_get_common_cflags(
299 const DeviceRequestedFeatures& requested_features, bool split=false)
301 const int cuda_version = cuewCompilerVersion();
302 const int machine = system_cpu_bits();
303 const string source_path = path_get("source");
304 const string include_path = source_path;
305 string cflags = string_printf("-m%d "
306 "--ptxas-options=\"-v\" "
309 "-D__KERNEL_CUDA_VERSION__=%d "
313 include_path.c_str());
314 if(use_adaptive_compilation()) {
315 cflags += " " + requested_features.get_build_options();
317 const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
319 cflags += string(" ") + string(extra_cflags);
321 #ifdef WITH_CYCLES_DEBUG
322 cflags += " -D__KERNEL_DEBUG__";
326 cflags += " -D__SPLIT__";
332 bool compile_check_compiler() {
333 const char *nvcc = cuewCompilerPath();
			cuda_error_message("CUDA nvcc compiler not found. "
			                   "Install the CUDA toolkit in the default location.");
339 const int cuda_version = cuewCompilerVersion();
340 VLOG(1) << "Found nvcc " << nvcc
341 << ", CUDA version " << cuda_version
		const int major = cuda_version / 10, minor = cuda_version % 10;
344 if(cuda_version == 0) {
345 cuda_error_message("CUDA nvcc compiler version could not be parsed.");
348 if(cuda_version < 80) {
349 printf("Unsupported CUDA version %d.%d detected, "
350 "you need CUDA 8.0 or newer.\n",
354 else if(cuda_version != 80) {
355 printf("CUDA version %d.%d detected, build may succeed but only "
356 "CUDA 8.0 is officially supported.\n",
362 string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false)
364 /* Compute cubin name. */
366 cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
367 cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
369 /* Attempt to use kernel provided with Blender. */
370 if(!use_adaptive_compilation()) {
371 const string cubin = path_get(string_printf(split ? "lib/kernel_split_sm_%d%d.cubin"
372 : "lib/kernel_sm_%d%d.cubin",
374 VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
375 if(path_exists(cubin)) {
376 VLOG(1) << "Using precompiled kernel.";
381 const string common_cflags =
382 compile_kernel_get_common_cflags(requested_features, split);
384 /* Try to use locally compiled kernel. */
385 const string source_path = path_get("source");
386 const string kernel_md5 = path_files_md5_hash(source_path);
	/* We include cflags into the MD5 so that changing the CUDA toolkit or other
	 * compiler command line arguments makes sure the cubin gets re-built.
391 const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);
393 const string cubin_file = string_printf(split ? "cycles_kernel_split_sm%d%d_%s.cubin"
394 : "cycles_kernel_sm%d%d_%s.cubin",
397 const string cubin = path_cache_get(path_join("kernels", cubin_file));
398 VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
399 if(path_exists(cubin)) {
400 VLOG(1) << "Using locally compiled kernel.";
405 if(have_precompiled_kernels()) {
407 cuda_error_message(string_printf(
408 "CUDA device requires compute capability 2.0 or up, "
409 "found %d.%d. Your GPU is not supported.",
413 cuda_error_message(string_printf(
414 "CUDA binary kernel for this graphics card compute "
415 "capability (%d.%d) not found.",
423 if(!compile_check_compiler()) {
426 const char *nvcc = cuewCompilerPath();
427 const string kernel = path_join(
428 path_join(source_path, "kernel"),
430 path_join("cuda", split ? "kernel_split.cu" : "kernel.cu")));
431 double starttime = time_dt();
432 printf("Compiling CUDA kernel ...\n");
434 path_create_directories(cubin);
436 string command = string_printf("\"%s\" "
445 common_cflags.c_str());
447 printf("%s\n", command.c_str());
449 if(system(command.c_str()) == -1) {
450 cuda_error_message("Failed to execute compilation command, "
451 "see console for details.");
		/* Verify that compilation succeeded */
456 if(!path_exists(cubin)) {
457 cuda_error_message("CUDA kernel compilation failed, "
458 "see console for details.");
462 printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
467 bool load_kernels(const DeviceRequestedFeatures& requested_features)
469 /* check if cuda init succeeded */
473 /* check if GPU is supported */
474 if(!support_device(requested_features))
478 string cubin = compile_kernel(requested_features, use_split_kernel());
489 if(path_read_text(cubin, cubin_data))
490 result = cuModuleLoadData(&cuModule, cubin_data.c_str());
492 result = CUDA_ERROR_FILE_NOT_FOUND;
494 if(cuda_error_(result, "cuModuleLoad"))
495 cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));
499 return (result == CUDA_SUCCESS);
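	/* Re-upload the bindless texture slot mapping to the device when it has
	 * changed, so kernels can look up texture objects by flat slot index. */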
502 void load_bindless_mapping()
504 if(info.has_bindless_textures && need_bindless_mapping) {
505 tex_free(bindless_mapping);
506 tex_alloc("__bindless_mapping", bindless_mapping, INTERPOLATION_NONE, EXTENSION_REPEAT);
507 need_bindless_mapping = false;
511 void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/)
514 VLOG(1) << "Buffer allocate: " << name << ", "
515 << string_human_readable_number(mem.memory_size()) << " bytes. ("
516 << string_human_readable_size(mem.memory_size()) << ")";
520 CUdeviceptr device_pointer;
521 size_t size = mem.memory_size();
522 cuda_assert(cuMemAlloc(&device_pointer, size));
523 mem.device_pointer = (device_ptr)device_pointer;
524 mem.device_size = size;
525 stats.mem_alloc(size);
529 void mem_copy_to(device_memory& mem)
532 if(mem.device_pointer)
533 cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size()));
537 void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
539 size_t offset = elem*y*w;
540 size_t size = elem*w*h;
543 if(mem.device_pointer) {
544 cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset,
545 (CUdeviceptr)(mem.device_pointer + offset), size));
548 memset((char*)mem.data_pointer + offset, 0, size);
553 void mem_zero(device_memory& mem)
555 if(mem.data_pointer) {
556 memset((void*)mem.data_pointer, 0, mem.memory_size());
560 if(mem.device_pointer)
561 cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()));
565 void mem_free(device_memory& mem)
567 if(mem.device_pointer) {
569 cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer)));
572 mem.device_pointer = 0;
574 stats.mem_free(mem.device_size);
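	/* Copy host data into a named global symbol of the loaded CUDA module. */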
579 void const_copy_to(const char *name, void *host, size_t size)
585 cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
586 //assert(bytes == size);
587 cuda_assert(cuMemcpyHtoD(mem, host, size));
591 void tex_alloc(const char *name,
593 InterpolationType interpolation,
594 ExtensionType extension)
596 VLOG(1) << "Texture allocate: " << name << ", "
597 << string_human_readable_number(mem.memory_size()) << " bytes. ("
598 << string_human_readable_size(mem.memory_size()) << ")";
		/* Check if we are on sm_30 or above.
		 * We use arrays and bindless textures for storage there. */
602 bool has_bindless_textures = info.has_bindless_textures;
604 /* General variables for both architectures */
605 string bind_name = name;
606 size_t dsize = datatype_size(mem.data_type);
607 size_t size = mem.memory_size();
609 CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
611 case EXTENSION_REPEAT:
612 address_mode = CU_TR_ADDRESS_MODE_WRAP;
614 case EXTENSION_EXTEND:
615 address_mode = CU_TR_ADDRESS_MODE_CLAMP;
618 address_mode = CU_TR_ADDRESS_MODE_BORDER;
625 CUfilter_mode filter_mode;
626 if(interpolation == INTERPOLATION_CLOSEST) {
627 filter_mode = CU_TR_FILTER_MODE_POINT;
630 filter_mode = CU_TR_FILTER_MODE_LINEAR;
633 CUarray_format_enum format;
634 switch(mem.data_type) {
635 case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
636 case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
637 case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
638 case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
639 case TYPE_HALF: format = CU_AD_FORMAT_HALF; break;
640 default: assert(0); return;
643 /* General variables for Fermi */
644 CUtexref texref = NULL;
646 if(!has_bindless_textures) {
647 if(mem.data_depth > 1) {
			/* Kernel uses different bind names for 2d and 3d float textures,
			 * so we have to adjust a couple of things here.
651 vector<string> tokens;
652 string_split(tokens, name, "_");
653 bind_name = string_printf("__tex_image_%s_3d_%s",
659 cuda_assert(cuModuleGetTexRef(&texref, cuModule, bind_name.c_str()));
668 if(interpolation == INTERPOLATION_NONE) {
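			/* Data textures without interpolation live in plain global memory:
			 * the kernel reads them through a raw device pointer (bindless path)
			 * or through a linear texref with point filtering (Fermi). */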
669 if(has_bindless_textures) {
670 mem_alloc(NULL, mem, MEM_READ_ONLY);
678 cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str()));
681 /* 64 bit device pointer */
682 uint64_t ptr = mem.device_pointer;
683 cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
686 /* 32 bit device pointer */
687 uint32_t ptr = (uint32_t)mem.device_pointer;
688 cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
694 mem_alloc(NULL, mem, MEM_READ_ONLY);
699 cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size));
700 cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
701 cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER));
706 /* Texture Storage */
708 CUarray handle = NULL;
712 if(mem.data_depth > 1) {
713 CUDA_ARRAY3D_DESCRIPTOR desc;
715 desc.Width = mem.data_width;
716 desc.Height = mem.data_height;
717 desc.Depth = mem.data_depth;
718 desc.Format = format;
719 desc.NumChannels = mem.data_elements;
722 cuda_assert(cuArray3DCreate(&handle, &desc));
725 CUDA_ARRAY_DESCRIPTOR desc;
727 desc.Width = mem.data_width;
728 desc.Height = mem.data_height;
729 desc.Format = format;
730 desc.NumChannels = mem.data_elements;
732 cuda_assert(cuArrayCreate(&handle, &desc));
740 /* Allocate 3D, 2D or 1D memory */
741 if(mem.data_depth > 1) {
				memset(&param, 0, sizeof(param));
744 param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
745 param.dstArray = handle;
746 param.srcMemoryType = CU_MEMORYTYPE_HOST;
747 param.srcHost = (void*)mem.data_pointer;
748 param.srcPitch = mem.data_width*dsize*mem.data_elements;
749 param.WidthInBytes = param.srcPitch;
750 param.Height = mem.data_height;
751 param.Depth = mem.data_depth;
				cuda_assert(cuMemcpy3D(&param));
755 else if(mem.data_height > 1) {
				memset(&param, 0, sizeof(param));
758 param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
759 param.dstArray = handle;
760 param.srcMemoryType = CU_MEMORYTYPE_HOST;
761 param.srcHost = (void*)mem.data_pointer;
762 param.srcPitch = mem.data_width*dsize*mem.data_elements;
763 param.WidthInBytes = param.srcPitch;
764 param.Height = mem.data_height;
				cuda_assert(cuMemcpy2D(&param));
769 cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size));
771 /* Fermi and Kepler */
772 mem.device_pointer = (device_ptr)handle;
773 mem.device_size = size;
775 stats.mem_alloc(size);
777 /* Bindless Textures - Kepler */
778 if(has_bindless_textures) {
780 if(string_startswith(name, "__tex_image")) {
781 int pos = string(name).rfind("_");
782 flat_slot = atoi(name + pos + 1);
788 CUDA_RESOURCE_DESC resDesc;
789 memset(&resDesc, 0, sizeof(resDesc));
790 resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
791 resDesc.res.array.hArray = handle;
794 CUDA_TEXTURE_DESC texDesc;
795 memset(&texDesc, 0, sizeof(texDesc));
796 texDesc.addressMode[0] = address_mode;
797 texDesc.addressMode[1] = address_mode;
798 texDesc.addressMode[2] = address_mode;
799 texDesc.filterMode = filter_mode;
800 texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
803 cuda_assert(cuTexObjectCreate(&tex, &resDesc, &texDesc, NULL));
			/* Safety check: the texture object handle must fit into the 32-bit mapping slot. */
			if(tex > UINT_MAX) {
811 if(flat_slot >= bindless_mapping.size()) {
				/* Allocate some slots in advance, to reduce the amount
				 * of re-allocations. */
815 bindless_mapping.resize(flat_slot + 128);
818 /* Set Mapping and tag that we need to (re-)upload to device */
819 bindless_mapping.get_data()[flat_slot] = (uint)tex;
820 tex_bindless_map[mem.device_pointer] = (uint)tex;
821 need_bindless_mapping = true;
823 /* Regular Textures - Fermi */
825 cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT));
826 cuda_assert(cuTexRefSetFilterMode(texref, filter_mode));
827 cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES));
833 /* Fermi, Data and Image Textures */
834 if(!has_bindless_textures) {
837 cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode));
838 cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode));
839 if(mem.data_depth > 1) {
840 cuda_assert(cuTexRefSetAddressMode(texref, 2, address_mode));
843 cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements));
848 /* Fermi and Kepler */
849 tex_interp_map[mem.device_pointer] = (interpolation != INTERPOLATION_NONE);
852 void tex_free(device_memory& mem)
854 if(mem.device_pointer) {
855 if(tex_interp_map[mem.device_pointer]) {
857 cuArrayDestroy((CUarray)mem.device_pointer);
860 /* Free CUtexObject (Bindless Textures) */
861 if(info.has_bindless_textures && tex_bindless_map[mem.device_pointer]) {
862 uint flat_slot = tex_bindless_map[mem.device_pointer];
863 cuTexObjectDestroy(flat_slot);
866 tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
867 mem.device_pointer = 0;
869 stats.mem_free(mem.device_size);
873 tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
879 void path_trace(RenderTile& rtile, int sample, bool branched)
886 CUfunction cuPathTrace;
887 CUdeviceptr d_buffer = cuda_device_ptr(rtile.buffer);
888 CUdeviceptr d_rng_state = cuda_device_ptr(rtile.rng_state);
890 /* get kernel function */
892 cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
895 cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
901 /* pass in parameters */
902 void *args[] = {&d_buffer,
913 int threads_per_block;
914 cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuPathTrace));
917 cuda_assert(cuFuncGetAttribute(&num_registers, CU_FUNC_ATTRIBUTE_NUM_REGS, cuPathTrace));
919 printf("threads_per_block %d\n", threads_per_block);
920 printf("num_registers %d\n", num_registers);*/
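		/* Launch over a 2D grid: use a roughly square thread block derived from
		 * the kernel's max threads per block, with enough blocks to cover the tile. */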
922 int xthreads = (int)sqrt(threads_per_block);
923 int ythreads = (int)sqrt(threads_per_block);
924 int xblocks = (rtile.w + xthreads - 1)/xthreads;
925 int yblocks = (rtile.h + ythreads - 1)/ythreads;
927 cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
929 cuda_assert(cuLaunchKernel(cuPathTrace,
930 xblocks , yblocks, 1, /* blocks */
931 xthreads, ythreads, 1, /* threads */
934 cuda_assert(cuCtxSynchronize());
939 void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
946 CUfunction cuFilmConvert;
947 CUdeviceptr d_rgba = map_pixels((rgba_byte)? rgba_byte: rgba_half);
948 CUdeviceptr d_buffer = cuda_device_ptr(buffer);
950 /* get kernel function */
952 cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float"));
955 cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"));
959 float sample_scale = 1.0f/(task.sample + 1);
961 /* pass in parameters */
962 void *args[] = {&d_rgba,
973 int threads_per_block;
974 cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert));
976 int xthreads = (int)sqrt(threads_per_block);
977 int ythreads = (int)sqrt(threads_per_block);
978 int xblocks = (task.w + xthreads - 1)/xthreads;
979 int yblocks = (task.h + ythreads - 1)/ythreads;
981 cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1));
983 cuda_assert(cuLaunchKernel(cuFilmConvert,
984 xblocks , yblocks, 1, /* blocks */
985 xthreads, ythreads, 1, /* threads */
988 unmap_pixels((rgba_byte)? rgba_byte: rgba_half);
993 void shader(DeviceTask& task)
1000 CUfunction cuShader;
1001 CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
1002 CUdeviceptr d_output = cuda_device_ptr(task.shader_output);
1003 CUdeviceptr d_output_luma = cuda_device_ptr(task.shader_output_luma);
1005 /* get kernel function */
1006 if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
1007 cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake"));
1010 cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_shader"));
1013 /* do tasks in smaller chunks, so we can cancel it */
1014 const int shader_chunk_size = 65536;
1015 const int start = task.shader_x;
1016 const int end = task.shader_x + task.shader_w;
1017 int offset = task.offset;
1019 bool canceled = false;
1020 for(int sample = 0; sample < task.num_samples && !canceled; sample++) {
1021 for(int shader_x = start; shader_x < end; shader_x += shader_chunk_size) {
1022 int shader_w = min(shader_chunk_size, end - shader_x);
1024 /* pass in parameters */
1027 args[arg++] = &d_input;
1028 args[arg++] = &d_output;
1029 if(task.shader_eval_type < SHADER_EVAL_BAKE) {
1030 args[arg++] = &d_output_luma;
1032 args[arg++] = &task.shader_eval_type;
1033 if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
1034 args[arg++] = &task.shader_filter;
1036 args[arg++] = &shader_x;
1037 args[arg++] = &shader_w;
1038 args[arg++] = &offset;
1039 args[arg++] = &sample;
1042 int threads_per_block;
1043 cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
1045 int xblocks = (shader_w + threads_per_block - 1)/threads_per_block;
1047 cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
1048 cuda_assert(cuLaunchKernel(cuShader,
1049 xblocks , 1, 1, /* blocks */
1050 threads_per_block, 1, 1, /* threads */
1053 cuda_assert(cuCtxSynchronize());
1055 if(task.get_cancel()) {
1061 task.update_progress(NULL);
1067 CUdeviceptr map_pixels(device_ptr mem)
1070 PixelMem pmem = pixel_mem_map[mem];
1074 cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0));
1075 cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource));
1080 return cuda_device_ptr(mem);
1083 void unmap_pixels(device_ptr mem)
1086 PixelMem pmem = pixel_mem_map[mem];
1088 cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0));
1092 void pixels_alloc(device_memory& mem)
1097 pmem.w = mem.data_width;
1098 pmem.h = mem.data_height;
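		/* Create a GL pixel buffer and texture and register the PBO with CUDA, so
		 * render results can be displayed without copying through the host; fall
		 * back to regular host-side pixel buffers if registration fails. */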
1100 cuda_push_context();
1102 glGenBuffers(1, &pmem.cuPBO);
1103 glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
1104 if(mem.data_type == TYPE_HALF)
1105 glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(GLhalf)*4, NULL, GL_DYNAMIC_DRAW);
1107 glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(uint8_t)*4, NULL, GL_DYNAMIC_DRAW);
1109 glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
1111 glGenTextures(1, &pmem.cuTexId);
1112 glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
1113 if(mem.data_type == TYPE_HALF)
1114 glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F_ARB, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL);
1116 glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
1117 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
1118 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
1119 glBindTexture(GL_TEXTURE_2D, 0);
1121 CUresult result = cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
1123 if(result == CUDA_SUCCESS) {
1126 mem.device_pointer = pmem.cuTexId;
1127 pixel_mem_map[mem.device_pointer] = pmem;
1129 mem.device_size = mem.memory_size();
1130 stats.mem_alloc(mem.device_size);
		/* Failed to register buffer, fall back to no interop. */
1136 glDeleteBuffers(1, &pmem.cuPBO);
1137 glDeleteTextures(1, &pmem.cuTexId);
1145 Device::pixels_alloc(mem);
1148 void pixels_copy_from(device_memory& mem, int y, int w, int h)
1151 PixelMem pmem = pixel_mem_map[mem.device_pointer];
1153 cuda_push_context();
1155 glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
1156 uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
1157 size_t offset = sizeof(uchar)*4*y*w;
1158 memcpy((uchar*)mem.data_pointer + offset, pixels + offset, sizeof(uchar)*4*w*h);
1159 glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
1160 glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
1167 Device::pixels_copy_from(mem, y, w, h);
1170 void pixels_free(device_memory& mem)
1172 if(mem.device_pointer) {
1174 PixelMem pmem = pixel_mem_map[mem.device_pointer];
1176 cuda_push_context();
1178 cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
1179 glDeleteBuffers(1, &pmem.cuPBO);
1180 glDeleteTextures(1, &pmem.cuTexId);
1184 pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
1185 mem.device_pointer = 0;
1187 stats.mem_free(mem.device_size);
1188 mem.device_size = 0;
1193 Device::pixels_free(mem);
1197 void draw_pixels(device_memory& mem, int y, int w, int h, int dx, int dy, int width, int height, bool transparent,
1198 const DeviceDrawParams &draw_params)
1201 PixelMem pmem = pixel_mem_map[mem.device_pointer];
1204 cuda_push_context();
1206 /* for multi devices, this assumes the inefficient method that we allocate
1207 * all pixels on the device even though we only render to a subset */
1208 size_t offset = 4*y*w;
1210 if(mem.data_type == TYPE_HALF)
1211 offset *= sizeof(GLhalf);
1213 offset *= sizeof(uint8_t);
1215 glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
1216 glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
1217 if(mem.data_type == TYPE_HALF)
1218 glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void*)offset);
1220 glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void*)offset);
1221 glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
1223 glEnable(GL_TEXTURE_2D);
1227 glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
1230 glColor3f(1.0f, 1.0f, 1.0f);
1232 if(draw_params.bind_display_space_shader_cb) {
1233 draw_params.bind_display_space_shader_cb();
1237 glGenBuffers(1, &vertex_buffer);
1239 glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
1240 /* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered */
1241 glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
1243 vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
1246 /* texture coordinate - vertex pair */
1252 vpointer[4] = (float)w/(float)pmem.w;
1254 vpointer[6] = (float)width + dx;
1257 vpointer[8] = (float)w/(float)pmem.w;
1258 vpointer[9] = (float)h/(float)pmem.h;
1259 vpointer[10] = (float)width + dx;
1260 vpointer[11] = (float)height + dy;
1262 vpointer[12] = 0.0f;
1263 vpointer[13] = (float)h/(float)pmem.h;
1265 vpointer[15] = (float)height + dy;
1267 glUnmapBuffer(GL_ARRAY_BUFFER);
1270 glTexCoordPointer(2, GL_FLOAT, 4 * sizeof(float), 0);
1271 glVertexPointer(2, GL_FLOAT, 4 * sizeof(float), (char *)NULL + 2 * sizeof(float));
1273 glEnableClientState(GL_VERTEX_ARRAY);
1274 glEnableClientState(GL_TEXTURE_COORD_ARRAY);
1276 glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
1278 glDisableClientState(GL_TEXTURE_COORD_ARRAY);
1279 glDisableClientState(GL_VERTEX_ARRAY);
1281 glBindBuffer(GL_ARRAY_BUFFER, 0);
1283 if(draw_params.unbind_display_space_shader_cb) {
1284 draw_params.unbind_display_space_shader_cb();
1288 glDisable(GL_BLEND);
1290 glBindTexture(GL_TEXTURE_2D, 0);
1291 glDisable(GL_TEXTURE_2D);
1298 Device::draw_pixels(mem, y, w, h, dx, dy, width, height, transparent, draw_params);
1301 void thread_run(DeviceTask *task)
1303 if(task->type == DeviceTask::PATH_TRACE) {
1306 bool branched = task->integrator_branched;
1308 /* Upload Bindless Mapping */
1309 load_bindless_mapping();
1311 if(!use_split_kernel()) {
1312 /* keep rendering tiles until done */
1313 while(task->acquire_tile(this, tile)) {
1314 int start_sample = tile.start_sample;
1315 int end_sample = tile.start_sample + tile.num_samples;
1317 for(int sample = start_sample; sample < end_sample; sample++) {
1318 if(task->get_cancel()) {
1319 if(task->need_finish_queue == false)
1323 path_trace(tile, sample, branched);
1325 tile.sample = sample + 1;
1327 task->update_progress(&tile, tile.w*tile.h);
1330 task->release_tile(tile);
1334 DeviceRequestedFeatures requested_features;
1335 if(!use_adaptive_compilation()) {
1336 requested_features.max_closure = 64;
1339 CUDASplitKernel split_kernel(this);
1340 split_kernel.load_kernels(requested_features);
1342 while(task->acquire_tile(this, tile)) {
1343 device_memory void_buffer;
1344 split_kernel.path_trace(task, tile, void_buffer, void_buffer);
1346 task->release_tile(tile);
1348 if(task->get_cancel()) {
1349 if(task->need_finish_queue == false)
1355 else if(task->type == DeviceTask::SHADER) {
1356 /* Upload Bindless Mapping */
1357 load_bindless_mapping();
1361 cuda_push_context();
1362 cuda_assert(cuCtxSynchronize());
1367 class CUDADeviceTask : public DeviceTask {
1369 CUDADeviceTask(CUDADevice *device, DeviceTask& task)
1372 run = function_bind(&CUDADevice::thread_run, device, this);
1376 int get_split_task_count(DeviceTask& /*task*/)
1381 void task_add(DeviceTask& task)
1383 if(task.type == DeviceTask::FILM_CONVERT) {
1384 /* must be done in main thread due to opengl access */
1385 film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
1387 cuda_push_context();
1388 cuda_assert(cuCtxSynchronize());
1392 task_pool.push(new CUDADeviceTask(this, task));
1406 friend class CUDASplitKernelFunction;
1407 friend class CUDASplitKernel;
1410 /* redefine the cuda_assert macro so it can be used outside of the CUDADevice class
1411 * now that the definition of that class is complete
1414 #define cuda_assert(stmt) \
1416 CUresult result = stmt; \
1418 if(result != CUDA_SUCCESS) { \
1419 string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
1420 if(device->error_msg == "") \
1421 device->error_msg = message; \
1422 fprintf(stderr, "%s\n", message.c_str()); \
1424 device->cuda_error_documentation(); \
class CUDASplitKernelFunction : public SplitKernelFunction {
1434 CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) {}
1436 /* enqueue the kernel, returns false if there is an error */
1437 bool enqueue(const KernelDimensions &dim, device_memory &/*kg*/, device_memory &/*data*/)
1439 return enqueue(dim, NULL);
1442 /* enqueue the kernel, returns false if there is an error */
1443 bool enqueue(const KernelDimensions &dim, void *args[])
1445 device->cuda_push_context();
1447 if(device->have_error())
1450 /* we ignore dim.local_size for now, as this is faster */
1451 int threads_per_block;
1452 cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
1454 int xthreads = (int)sqrt(threads_per_block);
1455 int ythreads = (int)sqrt(threads_per_block);
1457 int xblocks = (dim.global_size[0] + xthreads - 1)/xthreads;
1458 int yblocks = (dim.global_size[1] + ythreads - 1)/ythreads;
1460 cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
1462 cuda_assert(cuLaunchKernel(func,
1463 xblocks , yblocks, 1, /* blocks */
1464 xthreads, ythreads, 1, /* threads */
1467 device->cuda_pop_context();
1469 return !device->have_error();
1473 CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)
1477 uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads)
1479 device_vector<uint64_t> size_buffer;
1480 size_buffer.resize(1);
1481 device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE);
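	/* Launch a helper kernel that writes the required split-kernel state size
	 * for the given thread count into a one-element buffer, then read it back. */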
1483 device->cuda_push_context();
1485 uint threads = num_threads;
1486 CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer);
1498 CUfunction state_buffer_size;
1499 cuda_assert(cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
1501 cuda_assert(cuLaunchKernel(state_buffer_size,
1504 0, 0, (void**)&args, 0));
1506 device->cuda_pop_context();
1508 device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint64_t));
1509 device->mem_free(size_buffer);
1511 return *size_buffer.get_data();
1514 bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
1516 int num_global_elements,
1517 device_memory& /*kernel_globals*/,
1518 device_memory& /*kernel_data*/,
1519 device_memory& split_data,
1520 device_memory& ray_state,
1521 device_memory& queue_index,
1522 device_memory& use_queues_flag,
1523 device_memory& work_pool_wgs)
1525 device->cuda_push_context();
1527 CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer);
1528 CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer);
1529 CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer);
1530 CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer);
1531 CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer);
1533 CUdeviceptr d_rng_state = device->cuda_device_ptr(rtile.rng_state);
1534 CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer);
1536 int end_sample = rtile.start_sample + rtile.num_samples;
1537 int queue_size = dim.global_size[0] * dim.global_size[1];
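	/* Gather arguments for kernel_cuda_path_trace_data_init, which initializes
	 * the split kernel state, ray state and work queues for this tile. */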
1540 CUdeviceptr* split_data_buffer;
1542 CUdeviceptr* ray_state;
1543 CUdeviceptr* rng_state;
1552 CUdeviceptr* queue_index;
1554 CUdeviceptr* use_queues_flag;
1555 CUdeviceptr* work_pool_wgs;
1557 CUdeviceptr* buffer;
1562 &num_global_elements,
1565 &rtile.start_sample,
1581 CUfunction data_init;
1582 cuda_assert(cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
1583 if(device->have_error()) {
1587 CUDASplitKernelFunction(device, data_init).enqueue(dim, (void**)&args);
1589 device->cuda_pop_context();
1591 return !device->have_error();
1594 SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&)
1598 device->cuda_push_context();
1600 cuda_assert(cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()));
1601 if(device->have_error()) {
1602 device->cuda_error_message(string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data()));
1606 device->cuda_pop_context();
1608 return new CUDASplitKernelFunction(device, func);
1611 int2 CUDASplitKernel::split_kernel_local_size()
1613 return make_int2(32, 1);
1616 int2 CUDASplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask * /*task*/)
1618 /* TODO(mai): implement something here to detect ideal work size */
1619 return make_int2(256, 256);
1622 bool device_cuda_init(void)
1624 #ifdef WITH_CUDA_DYNLOAD
1625 static bool initialized = false;
1626 static bool result = false;
1632 int cuew_result = cuewInit();
1633 if(cuew_result == CUEW_SUCCESS) {
1634 VLOG(1) << "CUEW initialization succeeded";
1635 if(CUDADevice::have_precompiled_kernels()) {
1636 VLOG(1) << "Found precompiled kernels";
1640 else if(cuewCompilerPath() != NULL) {
1641 VLOG(1) << "Found CUDA compiler " << cuewCompilerPath();
			VLOG(1) << "Neither precompiled kernels nor CUDA compiler was found,"
1646 << " unable to use CUDA";
1651 VLOG(1) << "CUEW initialization failed: "
1652 << ((cuew_result == CUEW_ERROR_ATEXIT_FAILED)
1653 ? "Error setting up atexit() handler"
1654 : "Error opening the library");
1658 #else /* WITH_CUDA_DYNLOAD */
1660 #endif /* WITH_CUDA_DYNLOAD */
1663 Device *device_cuda_create(DeviceInfo& info, Stats &stats, bool background)
1665 return new CUDADevice(info, stats, background);
1668 void device_cuda_info(vector<DeviceInfo>& devices)
1674 if(result != CUDA_SUCCESS) {
1675 if(result != CUDA_ERROR_NO_DEVICE)
1676 fprintf(stderr, "CUDA cuInit: %s\n", cuewErrorString(result));
1680 result = cuDeviceGetCount(&count);
1681 if(result != CUDA_SUCCESS) {
1682 fprintf(stderr, "CUDA cuDeviceGetCount: %s\n", cuewErrorString(result));
1686 vector<DeviceInfo> display_devices;
1688 for(int num = 0; num < count; num++) {
1692 if(cuDeviceGetName(name, 256, num) != CUDA_SUCCESS)
1696 cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, num);
1703 info.type = DEVICE_CUDA;
1704 info.description = string(name);
1707 info.advanced_shading = (major >= 2);
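		/* Bindless textures require an sm_30 (Kepler) or newer device. */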
1708 info.has_bindless_textures = (major >= 3);
1709 info.pack_images = false;
1711 int pci_location[3] = {0, 0, 0};
1712 cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num);
1713 cuDeviceGetAttribute(&pci_location[1], CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, num);
1714 cuDeviceGetAttribute(&pci_location[2], CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, num);
1715 info.id = string_printf("CUDA_%s_%04x:%02x:%02x",
1717 (unsigned int)pci_location[0],
1718 (unsigned int)pci_location[1],
1719 (unsigned int)pci_location[2]);
1721 /* if device has a kernel timeout, assume it is used for display */
1722 if(cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num) == CUDA_SUCCESS && attr == 1) {
1723 info.description += " (Display)";
1724 info.display_device = true;
1725 display_devices.push_back(info);
1728 devices.push_back(info);
1731 if(!display_devices.empty())
1732 devices.insert(devices.end(), display_devices.begin(), display_devices.end());
1735 string device_cuda_capabilities(void)
1737 CUresult result = cuInit(0);
1738 if(result != CUDA_SUCCESS) {
1739 if(result != CUDA_ERROR_NO_DEVICE) {
1740 return string("Error initializing CUDA: ") + cuewErrorString(result);
1742 return "No CUDA device found\n";
1746 result = cuDeviceGetCount(&count);
1747 if(result != CUDA_SUCCESS) {
1748 return string("Error getting devices: ") + cuewErrorString(result);
1751 string capabilities = "";
1752 for(int num = 0; num < count; num++) {
1754 if(cuDeviceGetName(name, 256, num) != CUDA_SUCCESS) {
1757 capabilities += string("\t") + name + "\n";
1759 #define GET_ATTR(attr) \
1761 if(cuDeviceGetAttribute(&value, \
1762 CU_DEVICE_ATTRIBUTE_##attr, \
1763 num) == CUDA_SUCCESS) \
1765 capabilities += string_printf("\t\tCU_DEVICE_ATTRIBUTE_" #attr "\t\t\t%d\n", \
		/* TODO(sergey): Strip all attributes which are not useful for us
		 * or do not depend on the driver.
1772 GET_ATTR(MAX_THREADS_PER_BLOCK);
1773 GET_ATTR(MAX_BLOCK_DIM_X);
1774 GET_ATTR(MAX_BLOCK_DIM_Y);
1775 GET_ATTR(MAX_BLOCK_DIM_Z);
1776 GET_ATTR(MAX_GRID_DIM_X);
1777 GET_ATTR(MAX_GRID_DIM_Y);
1778 GET_ATTR(MAX_GRID_DIM_Z);
1779 GET_ATTR(MAX_SHARED_MEMORY_PER_BLOCK);
1780 GET_ATTR(SHARED_MEMORY_PER_BLOCK);
1781 GET_ATTR(TOTAL_CONSTANT_MEMORY);
1782 GET_ATTR(WARP_SIZE);
1783 GET_ATTR(MAX_PITCH);
1784 GET_ATTR(MAX_REGISTERS_PER_BLOCK);
1785 GET_ATTR(REGISTERS_PER_BLOCK);
1786 GET_ATTR(CLOCK_RATE);
1787 GET_ATTR(TEXTURE_ALIGNMENT);
1788 GET_ATTR(GPU_OVERLAP);
1789 GET_ATTR(MULTIPROCESSOR_COUNT);
1790 GET_ATTR(KERNEL_EXEC_TIMEOUT);
1791 GET_ATTR(INTEGRATED);
1792 GET_ATTR(CAN_MAP_HOST_MEMORY);
1793 GET_ATTR(COMPUTE_MODE);
1794 GET_ATTR(MAXIMUM_TEXTURE1D_WIDTH);
1795 GET_ATTR(MAXIMUM_TEXTURE2D_WIDTH);
1796 GET_ATTR(MAXIMUM_TEXTURE2D_HEIGHT);
1797 GET_ATTR(MAXIMUM_TEXTURE3D_WIDTH);
1798 GET_ATTR(MAXIMUM_TEXTURE3D_HEIGHT);
1799 GET_ATTR(MAXIMUM_TEXTURE3D_DEPTH);
1800 GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_WIDTH);
1801 GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_HEIGHT);
1802 GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_LAYERS);
1803 GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_WIDTH);
1804 GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_HEIGHT);
1805 GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES);
1806 GET_ATTR(SURFACE_ALIGNMENT);
1807 GET_ATTR(CONCURRENT_KERNELS);
1808 GET_ATTR(ECC_ENABLED);
1809 GET_ATTR(TCC_DRIVER);
1810 GET_ATTR(MEMORY_CLOCK_RATE);
1811 GET_ATTR(GLOBAL_MEMORY_BUS_WIDTH);
1812 GET_ATTR(L2_CACHE_SIZE);
1813 GET_ATTR(MAX_THREADS_PER_MULTIPROCESSOR);
1814 GET_ATTR(ASYNC_ENGINE_COUNT);
1815 GET_ATTR(UNIFIED_ADDRESSING);
1816 GET_ATTR(MAXIMUM_TEXTURE1D_LAYERED_WIDTH);
1817 GET_ATTR(MAXIMUM_TEXTURE1D_LAYERED_LAYERS);
1818 GET_ATTR(CAN_TEX2D_GATHER);
1819 GET_ATTR(MAXIMUM_TEXTURE2D_GATHER_WIDTH);
1820 GET_ATTR(MAXIMUM_TEXTURE2D_GATHER_HEIGHT);
1821 GET_ATTR(MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE);
1822 GET_ATTR(MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE);
1823 GET_ATTR(MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE);
1824 GET_ATTR(TEXTURE_PITCH_ALIGNMENT);
1825 GET_ATTR(MAXIMUM_TEXTURECUBEMAP_WIDTH);
1826 GET_ATTR(MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH);
1827 GET_ATTR(MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS);
1828 GET_ATTR(MAXIMUM_SURFACE1D_WIDTH);
1829 GET_ATTR(MAXIMUM_SURFACE2D_WIDTH);
1830 GET_ATTR(MAXIMUM_SURFACE2D_HEIGHT);
1831 GET_ATTR(MAXIMUM_SURFACE3D_WIDTH);
1832 GET_ATTR(MAXIMUM_SURFACE3D_HEIGHT);
1833 GET_ATTR(MAXIMUM_SURFACE3D_DEPTH);
1834 GET_ATTR(MAXIMUM_SURFACE1D_LAYERED_WIDTH);
1835 GET_ATTR(MAXIMUM_SURFACE1D_LAYERED_LAYERS);
1836 GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_WIDTH);
1837 GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_HEIGHT);
1838 GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_LAYERS);
1839 GET_ATTR(MAXIMUM_SURFACECUBEMAP_WIDTH);
1840 GET_ATTR(MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH);
1841 GET_ATTR(MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS);
1842 GET_ATTR(MAXIMUM_TEXTURE1D_LINEAR_WIDTH);
1843 GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_WIDTH);
1844 GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_HEIGHT);
1845 GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_PITCH);
1846 GET_ATTR(MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH);
1847 GET_ATTR(MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT);
1848 GET_ATTR(COMPUTE_CAPABILITY_MAJOR);
1849 GET_ATTR(COMPUTE_CAPABILITY_MINOR);
1850 GET_ATTR(MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH);
1851 GET_ATTR(STREAM_PRIORITIES_SUPPORTED);
1852 GET_ATTR(GLOBAL_L1_CACHE_SUPPORTED);
1853 GET_ATTR(LOCAL_L1_CACHE_SUPPORTED);
1854 GET_ATTR(MAX_SHARED_MEMORY_PER_MULTIPROCESSOR);
1855 GET_ATTR(MAX_REGISTERS_PER_MULTIPROCESSOR);
1856 GET_ATTR(MANAGED_MEMORY);
1857 GET_ATTR(MULTI_GPU_BOARD);
1858 GET_ATTR(MULTI_GPU_BOARD_GROUP_ID);
1860 capabilities += "\n";
1863 return capabilities;