Fix #35684: cycles unable to use full 6GB of memory on NVidia Titan GPU. We now
use arrays instead of textures for storage on cards with compute capability 3.5
and up; on earlier cards textures seem faster, but on Titan arrays are actually
slightly faster in tests.
intern/cycles/device/device_cuda.cpp
/*
 * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "device.h"
#include "device_intern.h"

#include "buffers.h"

#include "util_cuda.h"
#include "util_debug.h"
#include "util_map.h"
#include "util_opengl.h"
#include "util_path.h"
#include "util_system.h"
#include "util_types.h"
#include "util_time.h"

CCL_NAMESPACE_BEGIN

class CUDADevice : public Device
{
public:
        DedicatedTaskPool task_pool;
        CUdevice cuDevice;
        CUcontext cuContext;
        CUmodule cuModule;
        map<device_ptr, bool> tex_interp_map;
        int cuDevId;
        int cuDevArchitecture;
        bool first_error;
        bool use_texture_storage;

        struct PixelMem {
                GLuint cuPBO;
                CUgraphicsResource cuPBOresource;
                GLuint cuTexId;
                int w, h;
        };

        map<device_ptr, PixelMem> pixel_mem_map;

        CUdeviceptr cuda_device_ptr(device_ptr mem)
        {
                return (CUdeviceptr)mem;
        }

        static const char *cuda_error_string(CUresult result)
        {
                switch(result) {
                        case CUDA_SUCCESS: return "No errors";
                        case CUDA_ERROR_INVALID_VALUE: return "Invalid value";
                        case CUDA_ERROR_OUT_OF_MEMORY: return "Out of memory";
                        case CUDA_ERROR_NOT_INITIALIZED: return "Driver not initialized";
                        case CUDA_ERROR_DEINITIALIZED: return "Driver deinitialized";

                        case CUDA_ERROR_NO_DEVICE: return "No CUDA-capable device available";
                        case CUDA_ERROR_INVALID_DEVICE: return "Invalid device";

                        case CUDA_ERROR_INVALID_IMAGE: return "Invalid kernel image";
                        case CUDA_ERROR_INVALID_CONTEXT: return "Invalid context";
                        case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: return "Context already current";
                        case CUDA_ERROR_MAP_FAILED: return "Map failed";
                        case CUDA_ERROR_UNMAP_FAILED: return "Unmap failed";
                        case CUDA_ERROR_ARRAY_IS_MAPPED: return "Array is mapped";
                        case CUDA_ERROR_ALREADY_MAPPED: return "Already mapped";
                        case CUDA_ERROR_NO_BINARY_FOR_GPU: return "No binary for GPU";
                        case CUDA_ERROR_ALREADY_ACQUIRED: return "Already acquired";
                        case CUDA_ERROR_NOT_MAPPED: return "Not mapped";
                        case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: return "Mapped resource not available for access as an array";
                        case CUDA_ERROR_NOT_MAPPED_AS_POINTER: return "Mapped resource not available for access as a pointer";
                        case CUDA_ERROR_ECC_UNCORRECTABLE: return "Uncorrectable ECC error detected";
                        case CUDA_ERROR_UNSUPPORTED_LIMIT: return "CUlimit not supported by device";

                        case CUDA_ERROR_INVALID_SOURCE: return "Invalid source";
                        case CUDA_ERROR_FILE_NOT_FOUND: return "File not found";
                        case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "Link to a shared object failed to resolve";
                        case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: return "Shared object initialization failed";

                        case CUDA_ERROR_INVALID_HANDLE: return "Invalid handle";

                        case CUDA_ERROR_NOT_FOUND: return "Not found";

                        case CUDA_ERROR_NOT_READY: return "CUDA not ready";

                        case CUDA_ERROR_LAUNCH_FAILED: return "Launch failed";
                        case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "Launch exceeded resources";
                        case CUDA_ERROR_LAUNCH_TIMEOUT: return "Launch exceeded timeout";
                        case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "Launch with incompatible texturing";

                        case CUDA_ERROR_UNKNOWN: return "Unknown error";

                        default: return "Unknown CUDA error value";
                }
        }

/*#ifdef NDEBUG
#define cuda_abort()
#else
#define cuda_abort() abort()
#endif*/
        void cuda_error_documentation()
        {
                if(first_error) {
                        fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
                        fprintf(stderr, "http://wiki.blender.org/index.php/Doc:2.6/Manual/Render/Cycles/GPU_Rendering\n\n");
                        first_error = false;
                }
        }
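
/* Check the result of a CUDA driver API call. On failure this records the
 * first error message for the UI, prints to stderr and points the user at
 * the documentation, but intentionally does not abort, so rendering can
 * fail gracefully. */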
#define cuda_assert(stmt) \
        { \
                CUresult result = stmt; \
                \
                if(result != CUDA_SUCCESS) { \
                        string message = string_printf("CUDA error: %s in %s", cuda_error_string(result), #stmt); \
                        if(error_msg == "") \
                                error_msg = message; \
                        fprintf(stderr, "%s\n", message.c_str()); \
                        /*cuda_abort();*/ \
                        cuda_error_documentation(); \
                } \
        }

        bool cuda_error_(CUresult result, const string& stmt)
        {
                if(result == CUDA_SUCCESS)
                        return false;

                string message = string_printf("CUDA error at %s: %s", stmt.c_str(), cuda_error_string(result));
                if(error_msg == "")
                        error_msg = message;
                fprintf(stderr, "%s\n", message.c_str());
                cuda_error_documentation();
                return true;
        }

#define cuda_error(stmt) cuda_error_(stmt, #stmt)

        void cuda_error_message(const string& message)
        {
                if(error_msg == "")
                        error_msg = message;
                fprintf(stderr, "%s\n", message.c_str());
                cuda_error_documentation();
        }
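
        /* The CUDA context is bound to whichever thread made it current; every
         * device method brackets its driver API calls with this push/pop pair
         * since tasks may be executed from different threads. */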
        void cuda_push_context()
        {
                cuda_assert(cuCtxSetCurrent(cuContext))
        }

        void cuda_pop_context()
        {
                cuda_assert(cuCtxSetCurrent(NULL));
        }

        CUDADevice(DeviceInfo& info, Stats &stats, bool background_) : Device(stats)
        {
                first_error = true;
                background = background_;
                use_texture_storage = true;

                cuDevId = info.num;
                cuDevice = 0;
                cuContext = 0;

                /* initialize */
                if(cuda_error(cuInit(0)))
                        return;

                /* setup device and context */
                if(cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
                        return;

                CUresult result;

                if(background) {
                        result = cuCtxCreate(&cuContext, 0, cuDevice);
                }
                else {
                        result = cuGLCtxCreate(&cuContext, 0, cuDevice);

                        if(result != CUDA_SUCCESS) {
                                result = cuCtxCreate(&cuContext, 0, cuDevice);
                                background = true;
                        }
                }

                if(cuda_error_(result, "cuCtxCreate"))
                        return;

                int major, minor;
                cuDeviceComputeCapability(&major, &minor, cuDevId);
                cuDevArchitecture = major*100 + minor*10;

                /* In order to use full 6GB of memory on Titan cards, use arrays instead
                 * of textures. On earlier cards this seems slower, but on Titan it is
                 * actually slightly faster in tests. */
                use_texture_storage = (cuDevArchitecture < 350);

                cuda_pop_context();
        }

        ~CUDADevice()
        {
                task_pool.stop();

                cuda_assert(cuCtxDestroy(cuContext))
        }

        bool support_device(bool experimental)
        {
                if(!experimental) {
                        int major, minor;
                        cuDeviceComputeCapability(&major, &minor, cuDevId);

                        if(major < 2) {
                                cuda_error_message(string_printf("CUDA device supported only with compute capability 2.0 or up, found %d.%d.", major, minor));
                                return false;
                        }
                }

                return true;
        }
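
        /* Find a compiled kernel (.cubin) for this device. Tries, in order: a
         * precompiled kernel shipped with Blender, a previously compiled kernel
         * from the user cache directory, and finally compiling kernel.cu with
         * nvcc. Returns an empty string on failure. */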
        string compile_kernel()
        {
                /* compute cubin name */
                int major, minor;
                cuDeviceComputeCapability(&major, &minor, cuDevId);

                /* attempt to use kernel provided with blender */
                string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", major, minor));
                if(path_exists(cubin))
                        return cubin;

                /* not found, try to use locally compiled kernel */
                string kernel_path = path_get("kernel");
                string md5 = path_files_md5_hash(kernel_path);

                cubin = string_printf("cycles_kernel_sm%d%d_%s.cubin", major, minor, md5.c_str());
                cubin = path_user_get(path_join("cache", cubin));

                /* if exists already, use it */
                if(path_exists(cubin))
                        return cubin;

#ifdef _WIN32
                if(cuHavePrecompiledKernels()) {
                        if(major < 2)
                                cuda_error_message(string_printf("CUDA device requires compute capability 2.0 or up, found %d.%d. Your GPU is not supported.", major, minor));
                        else
                                cuda_error_message(string_printf("CUDA binary kernel for this graphics card compute capability (%d.%d) not found.", major, minor));
                        return "";
                }
#endif

                /* if not, find CUDA compiler */
                string nvcc = cuCompilerPath();

                if(nvcc == "") {
                        cuda_error_message("CUDA nvcc compiler not found. Install CUDA toolkit in default location.");
                        return "";
                }

                int cuda_version = cuCompilerVersion();

                if(cuda_version == 0) {
                        cuda_error_message("CUDA nvcc compiler version could not be parsed.");
                        return "";
                }

                if(cuda_version != 50)
                        printf("CUDA version %d.%d detected, build may succeed but only CUDA 5.0 is officially supported.\n", cuda_version/10, cuda_version%10);

                /* compile */
                string kernel = path_join(kernel_path, "kernel.cu");
                string include = kernel_path;
                const int machine = system_cpu_bits();
                string arch_flags;

                /* build flags depending on CUDA version and arch */
                if(cuda_version < 50) {
                        /* CUDA 4.x */
                        if(major == 1) {
                                /* sm_1x */
                                arch_flags = "--maxrregcount=24 --opencc-options -OPT:Olimit=0";
                        }
                        else if(major == 2) {
                                /* sm_2x */
                                arch_flags = "--maxrregcount=24";
                        }
                        else {
                                /* sm_3x */
                                arch_flags = "--maxrregcount=32";
                        }
                }
                else {
                        /* CUDA 5.x */
                        if(major == 1) {
                                /* sm_1x */
                                arch_flags = "--maxrregcount=24 --opencc-options -OPT:Olimit=0 --use_fast_math";
                        }
                        else if(major == 2) {
                                /* sm_2x */
                                arch_flags = "--maxrregcount=32 --use_fast_math";
                        }
                        else {
                                /* sm_3x */
                                arch_flags = "--maxrregcount=32 --use_fast_math";
                        }
                }

                double starttime = time_dt();
                printf("Compiling CUDA kernel ...\n");

                path_create_directories(cubin);

                string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" "
                        "-o \"%s\" --ptxas-options=\"-v\" %s -I\"%s\" -DNVCC -D__KERNEL_CUDA_VERSION__=%d",
                        nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), arch_flags.c_str(), include.c_str(), cuda_version);

                printf("%s\n", command.c_str());

                if(system(command.c_str()) == -1) {
                        cuda_error_message("Failed to execute compilation command, see console for details.");
                        return "";
                }

                /* verify if compilation succeeded */
                if(!path_exists(cubin)) {
                        cuda_error_message("CUDA kernel compilation failed, see console for details.");
                        return "";
                }

                printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);

                return cubin;
        }

        bool load_kernels(bool experimental)
        {
                /* check if cuda init succeeded */
                if(cuContext == 0)
                        return false;

                /* check if GPU is supported with current feature set */
                if(!support_device(experimental))
                        return false;

                /* get kernel */
                string cubin = compile_kernel();

                if(cubin == "")
                        return false;

                /* open module */
                cuda_push_context();

                CUresult result = cuModuleLoad(&cuModule, cubin.c_str());
                if(cuda_error_(result, "cuModuleLoad"))
                        cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));

                cuda_pop_context();

                return (result == CUDA_SUCCESS);
        }

        void mem_alloc(device_memory& mem, MemoryType type)
        {
                cuda_push_context();
                CUdeviceptr device_pointer;
                size_t size = mem.memory_size();
                cuda_assert(cuMemAlloc(&device_pointer, size))
                mem.device_pointer = (device_ptr)device_pointer;
                stats.mem_alloc(size);
                cuda_pop_context();
        }

        void mem_copy_to(device_memory& mem)
        {
                cuda_push_context();
                if(mem.device_pointer)
                        cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size()))
                cuda_pop_context();
        }

        void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
        {
                size_t offset = elem*y*w;
                size_t size = elem*w*h;

                cuda_push_context();
                if(mem.device_pointer) {
                        cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset,
                                (CUdeviceptr)((uchar*)mem.device_pointer + offset), size))
                }
                else {
                        memset((char*)mem.data_pointer + offset, 0, size);
                }
                cuda_pop_context();
        }

        void mem_zero(device_memory& mem)
        {
                memset((void*)mem.data_pointer, 0, mem.memory_size());

                cuda_push_context();
                if(mem.device_pointer)
                        cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()))
                cuda_pop_context();
        }

        void mem_free(device_memory& mem)
        {
                if(mem.device_pointer) {
                        cuda_push_context();
                        cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer)))
                        cuda_pop_context();

                        mem.device_pointer = 0;

                        stats.mem_free(mem.memory_size());
                }
        }

        void const_copy_to(const char *name, void *host, size_t size)
        {
                CUdeviceptr mem;
                size_t bytes;

                cuda_push_context();
                cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name))
                //assert(bytes == size);
                cuda_assert(cuMemcpyHtoD(mem, host, size))
                cuda_pop_context();
        }

        void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
        {
                /* determine format */
                CUarray_format_enum format;
                size_t dsize = datatype_size(mem.data_type);
                size_t size = mem.memory_size();
                bool use_texture = interpolation || use_texture_storage;
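
                /* Data that needs interpolation goes into a CUDA array bound to a
                 * texture reference. Other data goes into linear memory, either bound
                 * to a texture reference (cards below sm_35) or referenced through a
                 * plain global device pointer (sm_35+, see use_texture_storage). */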
                if(use_texture) {

                        switch(mem.data_type) {
                                case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
                                case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
                                case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
                                case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
                                default: assert(0); return;
                        }

                        CUtexref texref = NULL;

                        cuda_push_context();
                        cuda_assert(cuModuleGetTexRef(&texref, cuModule, name))

                        if(!texref) {
                                cuda_pop_context();
                                return;
                        }

                        if(interpolation) {
                                CUarray handle = NULL;
                                CUDA_ARRAY_DESCRIPTOR desc;

                                desc.Width = mem.data_width;
                                desc.Height = mem.data_height;
                                desc.Format = format;
                                desc.NumChannels = mem.data_elements;

                                cuda_assert(cuArrayCreate(&handle, &desc))

                                if(!handle) {
                                        cuda_pop_context();
                                        return;
                                }

                                if(mem.data_height > 1) {
                                        CUDA_MEMCPY2D param;
                                        memset(&param, 0, sizeof(param));
                                        param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
                                        param.dstArray = handle;
                                        param.srcMemoryType = CU_MEMORYTYPE_HOST;
                                        param.srcHost = (void*)mem.data_pointer;
                                        param.srcPitch = mem.data_width*dsize*mem.data_elements;
                                        param.WidthInBytes = param.srcPitch;
                                        param.Height = mem.data_height;

                                        cuda_assert(cuMemcpy2D(&param))
                                }
                                else
                                        cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size))

                                cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT))

                                cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR))
                                cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES))

                                mem.device_pointer = (device_ptr)handle;

                                stats.mem_alloc(size);
                        }
                        else {
                                cuda_pop_context();

                                mem_alloc(mem, MEM_READ_ONLY);
                                mem_copy_to(mem);

                                cuda_push_context();

                                cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size))
                                cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT))
                                cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER))
                        }

                        if(periodic) {
                                cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_WRAP))
                                cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_WRAP))
                        }
                        else {
                                cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP))
                                cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP))
                        }
                        cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements))

                        cuda_pop_context();
                }
                else {
                        mem_alloc(mem, MEM_READ_ONLY);
                        mem_copy_to(mem);

                        cuda_push_context();

                        CUdeviceptr cumem;
                        size_t cubytes;

                        cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, name))

                        if(cubytes == 8) {
                                /* 64 bit device pointer */
                                uint64_t ptr = mem.device_pointer;
                                cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes))
                        }
                        else {
                                /* 32 bit device pointer */
                                uint32_t ptr = (uint32_t)mem.device_pointer;
                                cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes))
                        }

                        cuda_pop_context();
                }

                tex_interp_map[mem.device_pointer] = interpolation;
        }

        void tex_free(device_memory& mem)
        {
                if(mem.device_pointer) {
                        if(tex_interp_map[mem.device_pointer]) {
                                cuda_push_context();
                                cuArrayDestroy((CUarray)mem.device_pointer);
                                cuda_pop_context();

                                tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
                                mem.device_pointer = 0;

                                stats.mem_free(mem.memory_size());
                        }
                        else {
                                tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
                                mem_free(mem);
                        }
                }
        }

        void path_trace(RenderTile& rtile, int sample, bool branched)
        {
                if(have_error())
                        return;

                cuda_push_context();

                CUfunction cuPathTrace;
                CUdeviceptr d_buffer = cuda_device_ptr(rtile.buffer);
                CUdeviceptr d_rng_state = cuda_device_ptr(rtile.rng_state);

                /* get kernel function */
                if(branched)
                        cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"))
                else
                        cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"))

                if(have_error())
                        return;

                /* pass in parameters */
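                /* Arguments are passed with the legacy driver API execution control
                 * calls: each one is written into the parameter block at a manually
                 * computed offset, with align_up matching the alignment the kernel
                 * expects for that argument. */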
                int offset = 0;

                cuda_assert(cuParamSetv(cuPathTrace, offset, &d_buffer, sizeof(d_buffer)))
                offset += sizeof(d_buffer);

                cuda_assert(cuParamSetv(cuPathTrace, offset, &d_rng_state, sizeof(d_rng_state)))
                offset += sizeof(d_rng_state);

                offset = align_up(offset, __alignof(sample));

                cuda_assert(cuParamSeti(cuPathTrace, offset, sample))
                offset += sizeof(sample);

                cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.x))
                offset += sizeof(rtile.x);

                cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.y))
                offset += sizeof(rtile.y);

                cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.w))
                offset += sizeof(rtile.w);

                cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.h))
                offset += sizeof(rtile.h);

                cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.offset))
                offset += sizeof(rtile.offset);

                cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.stride))
                offset += sizeof(rtile.stride);

                cuda_assert(cuParamSetSize(cuPathTrace, offset))

                /* launch kernel: todo find optimal size, cache config for fermi */
                int xthreads = 16;
                int ythreads = 16;
                int xblocks = (rtile.w + xthreads - 1)/xthreads;
                int yblocks = (rtile.h + ythreads - 1)/ythreads;

                cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1))
                cuda_assert(cuFuncSetBlockShape(cuPathTrace, xthreads, ythreads, 1))
                cuda_assert(cuLaunchGrid(cuPathTrace, xblocks, yblocks))

                cuda_assert(cuCtxSynchronize())

                cuda_pop_context();
        }

        void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
        {
                if(have_error())
                        return;

                cuda_push_context();

                CUfunction cuFilmConvert;
                CUdeviceptr d_rgba = map_pixels((rgba_byte)? rgba_byte: rgba_half);
                CUdeviceptr d_buffer = cuda_device_ptr(buffer);

                /* get kernel function */
                if(rgba_half)
                        cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float"))
                else
                        cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"))

                /* pass in parameters */
                int offset = 0;

                cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_rgba, sizeof(d_rgba)))
                offset += sizeof(d_rgba);

                cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_buffer, sizeof(d_buffer)))
                offset += sizeof(d_buffer);

                float sample_scale = 1.0f/(task.sample + 1);
                offset = align_up(offset, __alignof(sample_scale));

                cuda_assert(cuParamSetf(cuFilmConvert, offset, sample_scale))
                offset += sizeof(sample_scale);

                cuda_assert(cuParamSeti(cuFilmConvert, offset, task.x))
                offset += sizeof(task.x);

                cuda_assert(cuParamSeti(cuFilmConvert, offset, task.y))
                offset += sizeof(task.y);

                cuda_assert(cuParamSeti(cuFilmConvert, offset, task.w))
                offset += sizeof(task.w);

                cuda_assert(cuParamSeti(cuFilmConvert, offset, task.h))
                offset += sizeof(task.h);

                cuda_assert(cuParamSeti(cuFilmConvert, offset, task.offset))
                offset += sizeof(task.offset);

                cuda_assert(cuParamSeti(cuFilmConvert, offset, task.stride))
                offset += sizeof(task.stride);

                cuda_assert(cuParamSetSize(cuFilmConvert, offset))

                /* launch kernel: todo find optimal size, cache config for fermi */
                int xthreads = 16;
                int ythreads = 16;
                int xblocks = (task.w + xthreads - 1)/xthreads;
                int yblocks = (task.h + ythreads - 1)/ythreads;

                cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1))
                cuda_assert(cuFuncSetBlockShape(cuFilmConvert, xthreads, ythreads, 1))
                cuda_assert(cuLaunchGrid(cuFilmConvert, xblocks, yblocks))

                unmap_pixels((rgba_byte)? rgba_byte: rgba_half);

                cuda_pop_context();
        }

        void shader(DeviceTask& task)
        {
                if(have_error())
                        return;

                cuda_push_context();

                CUfunction cuDisplace;
                CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
                CUdeviceptr d_output = cuda_device_ptr(task.shader_output);

                /* get kernel function */
                cuda_assert(cuModuleGetFunction(&cuDisplace, cuModule, "kernel_cuda_shader"))

                /* pass in parameters */
                int offset = 0;

                cuda_assert(cuParamSetv(cuDisplace, offset, &d_input, sizeof(d_input)))
                offset += sizeof(d_input);

                cuda_assert(cuParamSetv(cuDisplace, offset, &d_output, sizeof(d_output)))
                offset += sizeof(d_output);

                int shader_eval_type = task.shader_eval_type;
                offset = align_up(offset, __alignof(shader_eval_type));

                cuda_assert(cuParamSeti(cuDisplace, offset, task.shader_eval_type))
                offset += sizeof(task.shader_eval_type);

                cuda_assert(cuParamSeti(cuDisplace, offset, task.shader_x))
                offset += sizeof(task.shader_x);

                cuda_assert(cuParamSetSize(cuDisplace, offset))

                /* launch kernel: todo find optimal size, cache config for fermi */
                int xthreads = 16;
                int xblocks = (task.shader_w + xthreads - 1)/xthreads;

                cuda_assert(cuFuncSetCacheConfig(cuDisplace, CU_FUNC_CACHE_PREFER_L1))
                cuda_assert(cuFuncSetBlockShape(cuDisplace, xthreads, 1, 1))
                cuda_assert(cuLaunchGrid(cuDisplace, xblocks, 1))

                cuda_pop_context();
        }
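
        /* When drawing interactively (not background), pixels live in an OpenGL
         * pixel buffer object registered with CUDA; mapping it yields a device
         * pointer the kernel can write to directly. */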
        CUdeviceptr map_pixels(device_ptr mem)
        {
                if(!background) {
                        PixelMem pmem = pixel_mem_map[mem];
                        CUdeviceptr buffer;

                        size_t bytes;
                        cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0))
                        cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource))

                        return buffer;
                }

                return cuda_device_ptr(mem);
        }

        void unmap_pixels(device_ptr mem)
        {
                if(!background) {
                        PixelMem pmem = pixel_mem_map[mem];

                        cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0))
                }
        }
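
        /* Allocate a PBO and matching texture for interop display. If registering
         * the PBO with CUDA fails, fall back to the host memory path by switching
         * to background mode. */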
        void pixels_alloc(device_memory& mem)
        {
                if(!background) {
                        PixelMem pmem;

                        pmem.w = mem.data_width;
                        pmem.h = mem.data_height;

                        cuda_push_context();

                        glGenBuffers(1, &pmem.cuPBO);
                        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
                        if(mem.data_type == TYPE_HALF)
                                glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(GLhalf)*4, NULL, GL_DYNAMIC_DRAW);
                        else
                                glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(uint8_t)*4, NULL, GL_DYNAMIC_DRAW);

                        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

                        glGenTextures(1, &pmem.cuTexId);
                        glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
                        if(mem.data_type == TYPE_HALF)
                                glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F_ARB, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL);
                        else
                                glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
                        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
                        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
                        glBindTexture(GL_TEXTURE_2D, 0);

                        CUresult result = cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);

                        if(result == CUDA_SUCCESS) {
                                cuda_pop_context();

                                mem.device_pointer = pmem.cuTexId;
                                pixel_mem_map[mem.device_pointer] = pmem;

                                stats.mem_alloc(mem.memory_size());

                                return;
                        }
                        else {
                                /* failed to register buffer, fallback to no interop */
                                glDeleteBuffers(1, &pmem.cuPBO);
                                glDeleteTextures(1, &pmem.cuTexId);

                                cuda_pop_context();

                                background = true;
                        }
                }

                Device::pixels_alloc(mem);
        }

        void pixels_copy_from(device_memory& mem, int y, int w, int h)
        {
                if(!background) {
                        PixelMem pmem = pixel_mem_map[mem.device_pointer];

                        cuda_push_context();

                        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
                        uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
                        size_t offset = sizeof(uchar)*4*y*w;
                        memcpy((uchar*)mem.data_pointer + offset, pixels + offset, sizeof(uchar)*4*w*h);
                        glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
                        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

                        cuda_pop_context();

                        return;
                }

                Device::pixels_copy_from(mem, y, w, h);
        }

        void pixels_free(device_memory& mem)
        {
                if(mem.device_pointer) {
                        if(!background) {
                                PixelMem pmem = pixel_mem_map[mem.device_pointer];

                                cuda_push_context();

                                cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource))
                                glDeleteBuffers(1, &pmem.cuPBO);
                                glDeleteTextures(1, &pmem.cuTexId);

                                cuda_pop_context();

                                pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
                                mem.device_pointer = 0;

                                stats.mem_free(mem.memory_size());

                                return;
                        }

                        Device::pixels_free(mem);
                }
        }

        void draw_pixels(device_memory& mem, int y, int w, int h, int dy, int width, int height, bool transparent)
        {
                if(!background) {
                        PixelMem pmem = pixel_mem_map[mem.device_pointer];

                        cuda_push_context();

                        /* for multi devices, this assumes the inefficient method that we allocate
                         * all pixels on the device even though we only render to a subset */
                        size_t offset = 4*y*w;

                        if(mem.data_type == TYPE_HALF)
                                offset *= sizeof(GLhalf);
                        else
                                offset *= sizeof(uint8_t);

                        glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pmem.cuPBO);
                        glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
                        if(mem.data_type == TYPE_HALF)
                                glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void*)offset);
                        else
                                glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void*)offset);
                        glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);

                        glEnable(GL_TEXTURE_2D);

                        if(transparent) {
                                glEnable(GL_BLEND);
                                glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
                        }

                        glColor3f(1.0f, 1.0f, 1.0f);

                        glPushMatrix();
                        glTranslatef(0.0f, (float)dy, 0.0f);

                        glBegin(GL_QUADS);

                        glTexCoord2f(0.0f, 0.0f);
                        glVertex2f(0.0f, 0.0f);
                        glTexCoord2f((float)w/(float)pmem.w, 0.0f);
                        glVertex2f((float)width, 0.0f);
                        glTexCoord2f((float)w/(float)pmem.w, (float)h/(float)pmem.h);
                        glVertex2f((float)width, (float)height);
                        glTexCoord2f(0.0f, (float)h/(float)pmem.h);
                        glVertex2f(0.0f, (float)height);

                        glEnd();

                        glPopMatrix();

                        if(transparent)
                                glDisable(GL_BLEND);

                        glBindTexture(GL_TEXTURE_2D, 0);
                        glDisable(GL_TEXTURE_2D);

                        cuda_pop_context();

                        return;
                }

                Device::draw_pixels(mem, y, w, h, dy, width, height, transparent);
        }

        void thread_run(DeviceTask *task)
        {
                if(task->type == DeviceTask::PATH_TRACE) {
                        RenderTile tile;

                        bool branched = task->integrator_branched;

                        /* keep rendering tiles until done */
                        while(task->acquire_tile(this, tile)) {
                                int start_sample = tile.start_sample;
                                int end_sample = tile.start_sample + tile.num_samples;

                                for(int sample = start_sample; sample < end_sample; sample++) {
                                        if(task->get_cancel()) {
                                                if(task->need_finish_queue == false)
                                                        break;
                                        }

                                        path_trace(tile, sample, branched);

                                        tile.sample = sample + 1;

                                        task->update_progress(tile);
                                }

                                task->release_tile(tile);
                        }
                }
                else if(task->type == DeviceTask::SHADER) {
                        shader(*task);

                        cuda_push_context();
                        cuda_assert(cuCtxSynchronize())
                        cuda_pop_context();
                }
        }
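
        /* Wrapper that lets the task pool invoke thread_run on this device. */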
        class CUDADeviceTask : public DeviceTask {
        public:
                CUDADeviceTask(CUDADevice *device, DeviceTask& task)
                : DeviceTask(task)
                {
                        run = function_bind(&CUDADevice::thread_run, device, this);
                }
        };

        void task_add(DeviceTask& task)
        {
                if(task.type == DeviceTask::FILM_CONVERT) {
                        /* must be done in main thread due to opengl access */
                        film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);

                        cuda_push_context();
                        cuda_assert(cuCtxSynchronize())
                        cuda_pop_context();
                }
                else {
                        task_pool.push(new CUDADeviceTask(this, task));
                }
        }

        void task_wait()
        {
                task_pool.wait();
        }

        void task_cancel()
        {
                task_pool.cancel();
        }
};

Device *device_cuda_create(DeviceInfo& info, Stats &stats, bool background)
{
        return new CUDADevice(info, stats, background);
}
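
/* Enumerate CUDA devices for the device list. Devices with a kernel runtime
 * limit are assumed to drive a display and are appended after the others, so
 * dedicated compute cards are listed first. */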
void device_cuda_info(vector<DeviceInfo>& devices)
{
        CUresult result;
        int count = 0;

        result = cuInit(0);
        if(result != CUDA_SUCCESS) {
                if(result != CUDA_ERROR_NO_DEVICE)
                        fprintf(stderr, "CUDA cuInit: %s\n", CUDADevice::cuda_error_string(result));
                return;
        }

        result = cuDeviceGetCount(&count);
        if(result != CUDA_SUCCESS) {
                fprintf(stderr, "CUDA cuDeviceGetCount: %s\n", CUDADevice::cuda_error_string(result));
                return;
        }

        vector<DeviceInfo> display_devices;

        for(int num = 0; num < count; num++) {
                char name[256];
                int attr;

                if(cuDeviceGetName(name, 256, num) != CUDA_SUCCESS)
                        continue;

                DeviceInfo info;

                info.type = DEVICE_CUDA;
                info.description = string(name);
                info.id = string_printf("CUDA_%d", num);
                info.num = num;

                int major, minor;
                cuDeviceComputeCapability(&major, &minor, num);
                info.advanced_shading = (major >= 2);
                info.pack_images = false;

                /* if device has a kernel timeout, assume it is used for display */
                if(cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num) == CUDA_SUCCESS && attr == 1) {
                        info.display_device = true;
                        display_devices.push_back(info);
                }
                else
                        devices.push_back(info);
        }

        if(!display_devices.empty())
                devices.insert(devices.end(), display_devices.begin(), display_devices.end());
}

CCL_NAMESPACE_END