intern/cycles/device/device_cuda.cpp
/*
 * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17 #include <stdio.h>
18 #include <stdlib.h>
19 #include <string.h>
20
21 #include "device.h"
22 #include "device_intern.h"
23
24 #include "buffers.h"
25
26 #include "util_cuda.h"
27 #include "util_debug.h"
28 #include "util_map.h"
29 #include "util_opengl.h"
30 #include "util_path.h"
31 #include "util_system.h"
32 #include "util_types.h"
33 #include "util_time.h"
34
35 CCL_NAMESPACE_BEGIN
36
37 class CUDADevice : public Device
38 {
39 public:
40         DedicatedTaskPool task_pool;
41         CUdevice cuDevice;
42         CUcontext cuContext;
43         CUmodule cuModule;
44         map<device_ptr, bool> tex_interp_map;
45         int cuDevId;
46         bool first_error;
47
48         struct PixelMem {
49                 GLuint cuPBO;
50                 CUgraphicsResource cuPBOresource;
51                 GLuint cuTexId;
52                 int w, h;
53         };
54
55         map<device_ptr, PixelMem> pixel_mem_map;
56
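	/* device_ptr is Cycles' opaque integer handle for device memory, wide
	 * enough to hold a pointer, so converting to and from the driver API's
	 * CUdeviceptr is a plain cast. */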
	CUdeviceptr cuda_device_ptr(device_ptr mem)
	{
		return (CUdeviceptr)mem;
	}

	static const char *cuda_error_string(CUresult result)
	{
		switch(result) {
			case CUDA_SUCCESS: return "No errors";
			case CUDA_ERROR_INVALID_VALUE: return "Invalid value";
			case CUDA_ERROR_OUT_OF_MEMORY: return "Out of memory";
			case CUDA_ERROR_NOT_INITIALIZED: return "Driver not initialized";
			case CUDA_ERROR_DEINITIALIZED: return "Driver deinitialized";

			case CUDA_ERROR_NO_DEVICE: return "No CUDA-capable device available";
			case CUDA_ERROR_INVALID_DEVICE: return "Invalid device";

			case CUDA_ERROR_INVALID_IMAGE: return "Invalid kernel image";
			case CUDA_ERROR_INVALID_CONTEXT: return "Invalid context";
			case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: return "Context already current";
			case CUDA_ERROR_MAP_FAILED: return "Map failed";
			case CUDA_ERROR_UNMAP_FAILED: return "Unmap failed";
			case CUDA_ERROR_ARRAY_IS_MAPPED: return "Array is mapped";
			case CUDA_ERROR_ALREADY_MAPPED: return "Already mapped";
			case CUDA_ERROR_NO_BINARY_FOR_GPU: return "No binary for GPU";
			case CUDA_ERROR_ALREADY_ACQUIRED: return "Already acquired";
			case CUDA_ERROR_NOT_MAPPED: return "Not mapped";
			case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: return "Mapped resource not available for access as an array";
			case CUDA_ERROR_NOT_MAPPED_AS_POINTER: return "Mapped resource not available for access as a pointer";
			case CUDA_ERROR_ECC_UNCORRECTABLE: return "Uncorrectable ECC error detected";
			case CUDA_ERROR_UNSUPPORTED_LIMIT: return "CUlimit not supported by device";

			case CUDA_ERROR_INVALID_SOURCE: return "Invalid source";
			case CUDA_ERROR_FILE_NOT_FOUND: return "File not found";
			case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "Link to a shared object failed to resolve";
			case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: return "Shared object initialization failed";

			case CUDA_ERROR_INVALID_HANDLE: return "Invalid handle";

			case CUDA_ERROR_NOT_FOUND: return "Not found";

			case CUDA_ERROR_NOT_READY: return "CUDA not ready";

			case CUDA_ERROR_LAUNCH_FAILED: return "Launch failed";
			case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "Launch exceeded resources";
			case CUDA_ERROR_LAUNCH_TIMEOUT: return "Launch exceeded timeout";
			case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "Launch with incompatible texturing";

			case CUDA_ERROR_UNKNOWN: return "Unknown error";

			default: return "Unknown CUDA error value";
		}
	}

/*#ifdef NDEBUG
#define cuda_abort()
#else
#define cuda_abort() abort()
#endif*/
	void cuda_error_documentation()
	{
		if(first_error) {
			fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
			fprintf(stderr, "http://wiki.blender.org/index.php/Doc:2.6/Manual/Render/Cycles/GPU_Rendering\n\n");
			first_error = false;
		}
	}

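	/* cuda_assert() reports failures without aborting: it formats the error,
	 * stores the first one in the device's error_msg and prints every one to
	 * stderr. The macro expands to a bare block, so call sites that sit in
	 * if/else chains must omit the trailing semicolon (a `{...};` before
	 * `else` would not compile). */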
#define cuda_assert(stmt) \
	{ \
		CUresult result = stmt; \
		\
		if(result != CUDA_SUCCESS) { \
			string message = string_printf("CUDA error: %s in %s", cuda_error_string(result), #stmt); \
			if(error_msg == "") \
				error_msg = message; \
			fprintf(stderr, "%s\n", message.c_str()); \
			/*cuda_abort();*/ \
			cuda_error_documentation(); \
		} \
	}

	bool cuda_error_(CUresult result, const string& stmt)
	{
		if(result == CUDA_SUCCESS)
			return false;

		string message = string_printf("CUDA error at %s: %s", stmt.c_str(), cuda_error_string(result));
		if(error_msg == "")
			error_msg = message;
		fprintf(stderr, "%s\n", message.c_str());
		cuda_error_documentation();
		return true;
	}

#define cuda_error(stmt) cuda_error_(stmt, #stmt)

	void cuda_error_message(const string& message)
	{
		if(error_msg == "")
			error_msg = message;
		fprintf(stderr, "%s\n", message.c_str());
		cuda_error_documentation();
	}

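	/* The driver API tracks a current context per thread; since this device
	 * owns a single context, a simple bind/unbind pair via cuCtxSetCurrent()
	 * around each group of API calls is sufficient. */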
	void cuda_push_context()
	{
		cuda_assert(cuCtxSetCurrent(cuContext))
	}

	void cuda_pop_context()
	{
		cuda_assert(cuCtxSetCurrent(NULL));
	}

	CUDADevice(DeviceInfo& info, Stats &stats, bool background_) : Device(stats)
	{
		first_error = true;
		background = background_;

		cuDevId = info.num;
		cuDevice = 0;
		cuContext = 0;

		/* initialize */
		if(cuda_error(cuInit(0)))
			return;

		/* setup device and context */
		if(cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
			return;

		CUresult result;

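		/* In interactive sessions try to create a context that can share
		 * resources with the OpenGL context; if that fails (e.g. this GPU is
		 * not driving the display), fall back to a plain compute context and
		 * behave as a background device. */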
		if(background) {
			result = cuCtxCreate(&cuContext, 0, cuDevice);
		}
		else {
			result = cuGLCtxCreate(&cuContext, 0, cuDevice);

			if(result != CUDA_SUCCESS) {
				result = cuCtxCreate(&cuContext, 0, cuDevice);
				background = true;
			}
		}

		if(cuda_error_(result, "cuCtxCreate"))
			return;

		cuda_pop_context();
	}

	~CUDADevice()
	{
		task_pool.stop();

		cuda_push_context();
		cuda_assert(cuCtxDetach(cuContext))
	}

	bool support_device(bool experimental)
	{
		if(!experimental) {
			int major, minor;
			cuDeviceComputeCapability(&major, &minor, cuDevId);

			if(major < 2) {
				cuda_error_message(string_printf("CUDA device supported only with compute capability 2.0 or up, found %d.%d.", major, minor));
				return false;
			}
		}

		return true;
	}

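	/* Look up the CUDA kernel binary for this device's compute capability.
	 * Order: a cubin bundled with Blender, then a user-cache cubin keyed by
	 * an md5 of the kernel sources, and only then a local nvcc compile. */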
	string compile_kernel()
	{
		/* compute cubin name */
		int major, minor;
		cuDeviceComputeCapability(&major, &minor, cuDevId);

		/* attempt to use kernel provided with blender */
		string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", major, minor));
		if(path_exists(cubin))
			return cubin;

		/* not found, try to use locally compiled kernel */
		string kernel_path = path_get("kernel");
		string md5 = path_files_md5_hash(kernel_path);

		cubin = string_printf("cycles_kernel_sm%d%d_%s.cubin", major, minor, md5.c_str());
		cubin = path_user_get(path_join("cache", cubin));

		/* if exists already, use it */
		if(path_exists(cubin))
			return cubin;

#ifdef _WIN32
		if(cuHavePrecompiledKernels()) {
			if(major < 2)
				cuda_error_message(string_printf("CUDA device requires compute capability 2.0 or up, found %d.%d. Your GPU is not supported.", major, minor));
			else
				cuda_error_message(string_printf("CUDA binary kernel for this graphics card compute capability (%d.%d) not found.", major, minor));
			return "";
		}
#endif

		/* if not, find CUDA compiler */
		string nvcc = cuCompilerPath();

		if(nvcc == "") {
			cuda_error_message("CUDA nvcc compiler not found. Install CUDA toolkit in default location.");
			return "";
		}

		int cuda_version = cuCompilerVersion();

		if(cuda_version == 0) {
			cuda_error_message("CUDA nvcc compiler version could not be parsed.");
			return "";
		}

		if(cuda_version != 50)
			printf("CUDA version %d.%d detected, build may succeed but only CUDA 5.0 is officially supported.\n", cuda_version/10, cuda_version%10);

		/* compile */
		string kernel = path_join(kernel_path, "kernel.cu");
		string include = kernel_path;
		const int machine = system_cpu_bits();
		string arch_flags;

		/* build flags depending on CUDA version and arch */
		if(cuda_version < 50) {
			/* CUDA 4.x */
			if(major == 1) {
				/* sm_1x */
				arch_flags = "--maxrregcount=24 --opencc-options -OPT:Olimit=0";
			}
			else if(major == 2) {
				/* sm_2x */
				arch_flags = "--maxrregcount=24";
			}
			else {
				/* sm_3x */
				arch_flags = "--maxrregcount=32";
			}
		}
		else {
			/* CUDA 5.x */
			if(major == 1) {
				/* sm_1x */
				arch_flags = "--maxrregcount=24 --opencc-options -OPT:Olimit=0 --use_fast_math";
			}
			else if(major == 2) {
				/* sm_2x */
				arch_flags = "--maxrregcount=32 --use_fast_math";
			}
			else {
				/* sm_3x */
				arch_flags = "--maxrregcount=32 --use_fast_math";
			}
		}
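		/* Note: --maxrregcount caps registers per thread, presumably tuned
		 * here per architecture to trade register spills for occupancy;
		 * --use_fast_math enables the faster, lower-precision device
		 * intrinsics. */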

		double starttime = time_dt();
		printf("Compiling CUDA kernel ...\n");

		path_create_directories(cubin);

		string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" "
			"-o \"%s\" --ptxas-options=\"-v\" %s -I\"%s\" -DNVCC -D__KERNEL_CUDA_VERSION__=%d",
			nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), arch_flags.c_str(), include.c_str(), cuda_version);
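		/* For illustration only, on an sm_21 card in a 64-bit build with
		 * CUDA 5.0 this expands to roughly (paths hypothetical):
		 *   "nvcc" -arch=sm_21 -m64 --cubin "<kernel_path>/kernel.cu"
		 *   -o "<cache>/cycles_kernel_sm21_<md5>.cubin" --ptxas-options="-v"
		 *   --maxrregcount=32 --use_fast_math -I"<kernel_path>" -DNVCC
		 *   -D__KERNEL_CUDA_VERSION__=50 */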

		printf("%s\n", command.c_str());

		if(system(command.c_str()) == -1) {
			cuda_error_message("Failed to execute compilation command, see console for details.");
			return "";
		}

		/* verify if compilation succeeded */
		if(!path_exists(cubin)) {
			cuda_error_message("CUDA kernel compilation failed, see console for details.");
			return "";
		}

		printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);

		return cubin;
	}

	bool load_kernels(bool experimental)
	{
		/* check if cuda init succeeded */
		if(cuContext == 0)
			return false;

		/* check if GPU is supported with current feature set */
		if(!support_device(experimental))
			return false;

		/* get kernel */
		string cubin = compile_kernel();

		if(cubin == "")
			return false;

		/* open module */
		cuda_push_context();

		CUresult result = cuModuleLoad(&cuModule, cubin.c_str());
		if(cuda_error_(result, "cuModuleLoad"))
			cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));

		cuda_pop_context();

		return (result == CUDA_SUCCESS);
	}

	void mem_alloc(device_memory& mem, MemoryType type)
	{
		cuda_push_context();
		CUdeviceptr device_pointer;
		size_t size = mem.memory_size();
		cuda_assert(cuMemAlloc(&device_pointer, size))
		mem.device_pointer = (device_ptr)device_pointer;
		stats.mem_alloc(size);
		cuda_pop_context();
	}

	void mem_copy_to(device_memory& mem)
	{
		cuda_push_context();
		if(mem.device_pointer)
			cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size()))
		cuda_pop_context();
	}

	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
	{
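		/* elem is the per-pixel element size in bytes, so rows y..y+h-1 of a
		 * w-wide buffer start at byte offset elem*y*w and span elem*w*h
		 * bytes; the same offset applies on the host and device side. If the
		 * device allocation is missing, zero-fill the host range instead. */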
		size_t offset = elem*y*w;
		size_t size = elem*w*h;

		cuda_push_context();
		if(mem.device_pointer) {
			cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset,
				(CUdeviceptr)((uchar*)mem.device_pointer + offset), size))
		}
		else {
			memset((char*)mem.data_pointer + offset, 0, size);
		}
		cuda_pop_context();
	}

	void mem_zero(device_memory& mem)
	{
		memset((void*)mem.data_pointer, 0, mem.memory_size());

		cuda_push_context();
		if(mem.device_pointer)
			cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()))
		cuda_pop_context();
	}

	void mem_free(device_memory& mem)
	{
		if(mem.device_pointer) {
			cuda_push_context();
			cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer)))
			cuda_pop_context();

			mem.device_pointer = 0;

			stats.mem_free(mem.memory_size());
		}
	}

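	/* Copy host data into a named __constant__ variable of the loaded
	 * module; cuModuleGetGlobal() resolves the symbol by name. */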
	void const_copy_to(const char *name, void *host, size_t size)
	{
		CUdeviceptr mem;
		size_t bytes;

		cuda_push_context();
		cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name))
		//assert(bytes == size);
		cuda_assert(cuMemcpyHtoD(mem, host, size))
		cuda_pop_context();
	}

	void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
	{
		/* determine format */
		CUarray_format_enum format;
		size_t dsize = datatype_size(mem.data_type);
		size_t size = mem.memory_size();

		switch(mem.data_type) {
			case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
			case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
			case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
			case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
			default: assert(0); return;
		}

		CUtexref texref = NULL;

		cuda_push_context();
		cuda_assert(cuModuleGetTexRef(&texref, cuModule, name))

		if(!texref) {
			cuda_pop_context();
			return;
		}

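		/* Two storage strategies: interpolated textures go into a CUDA array
		 * with hardware linear filtering and normalized coordinates, while
		 * everything else is plain linear memory bound to the texture
		 * reference and fetched point-sampled without filtering. */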
		if(interpolation) {
			CUarray handle = NULL;
			CUDA_ARRAY_DESCRIPTOR desc;

			desc.Width = mem.data_width;
			desc.Height = mem.data_height;
			desc.Format = format;
			desc.NumChannels = mem.data_elements;

			cuda_assert(cuArrayCreate(&handle, &desc))

			if(!handle) {
				cuda_pop_context();
				return;
			}

			if(mem.data_height > 1) {
				CUDA_MEMCPY2D param;
				memset(&param, 0, sizeof(param));
				param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
				param.dstArray = handle;
				param.srcMemoryType = CU_MEMORYTYPE_HOST;
				param.srcHost = (void*)mem.data_pointer;
				param.srcPitch = mem.data_width*dsize*mem.data_elements;
				param.WidthInBytes = param.srcPitch;
				param.Height = mem.data_height;

				cuda_assert(cuMemcpy2D(&param))
			}
			else
				cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size))

			cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT))

			cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR))
			cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES))

			mem.device_pointer = (device_ptr)handle;

			stats.mem_alloc(size);
		}
		else {
			cuda_pop_context();

			mem_alloc(mem, MEM_READ_ONLY);
			mem_copy_to(mem);

			cuda_push_context();

			cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size))
			cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT))
			cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER))
		}

		if(periodic) {
			cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_WRAP))
			cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_WRAP))
		}
		else {
			cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP))
			cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP))
		}
		cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements))

		cuda_pop_context();

		tex_interp_map[mem.device_pointer] = interpolation;
	}

	void tex_free(device_memory& mem)
	{
		if(mem.device_pointer) {
			if(tex_interp_map[mem.device_pointer]) {
				cuda_push_context();
				cuArrayDestroy((CUarray)mem.device_pointer);
				cuda_pop_context();

				tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
				mem.device_pointer = 0;

				stats.mem_free(mem.memory_size());
			}
			else {
				tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
				mem_free(mem);
			}
		}
	}

	void path_trace(RenderTile& rtile, int sample, bool branched)
	{
		if(have_error())
			return;

		cuda_push_context();

		CUfunction cuPathTrace;
		CUdeviceptr d_buffer = cuda_device_ptr(rtile.buffer);
		CUdeviceptr d_rng_state = cuda_device_ptr(rtile.rng_state);

		/* get kernel function */
		if(branched)
			cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"))
		else
			cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"))

		if(have_error())
			return;

		/* pass in parameters */
		int offset = 0;
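		/* The legacy driver API execution path marshals kernel arguments by
		 * hand: each cuParamSet* writes at a byte offset that must match the
		 * kernel's parameter layout, so the offset is re-aligned with
		 * align_up() where the argument type changes. */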

		cuda_assert(cuParamSetv(cuPathTrace, offset, &d_buffer, sizeof(d_buffer)))
		offset += sizeof(d_buffer);

		cuda_assert(cuParamSetv(cuPathTrace, offset, &d_rng_state, sizeof(d_rng_state)))
		offset += sizeof(d_rng_state);

		offset = align_up(offset, __alignof(sample));

		cuda_assert(cuParamSeti(cuPathTrace, offset, sample))
		offset += sizeof(sample);

		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.x))
		offset += sizeof(rtile.x);

		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.y))
		offset += sizeof(rtile.y);

		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.w))
		offset += sizeof(rtile.w);

		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.h))
		offset += sizeof(rtile.h);

		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.offset))
		offset += sizeof(rtile.offset);

		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.stride))
		offset += sizeof(rtile.stride);

		cuda_assert(cuParamSetSize(cuPathTrace, offset))

		/* launch kernel: todo find optimal size, cache config for fermi */
		int xthreads = 16;
		int ythreads = 16;
		int xblocks = (rtile.w + xthreads - 1)/xthreads;
		int yblocks = (rtile.h + ythreads - 1)/ythreads;
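		/* (n + threads - 1)/threads is integer ceil division, so tiles whose
		 * size is not a multiple of 16 still get a partial block; the kernel
		 * is expected to bounds-check against the tile dimensions. */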

		cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1))
		cuda_assert(cuFuncSetBlockShape(cuPathTrace, xthreads, ythreads, 1))
		cuda_assert(cuLaunchGrid(cuPathTrace, xblocks, yblocks))

		cuda_assert(cuCtxSynchronize())

		cuda_pop_context();
	}

	void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
	{
		if(have_error())
			return;

		cuda_push_context();

		CUfunction cuFilmConvert;
		CUdeviceptr d_rgba = map_pixels((rgba_byte)? rgba_byte: rgba_half);
		CUdeviceptr d_buffer = cuda_device_ptr(buffer);

		/* get kernel function */
		if(rgba_half)
			cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float"))
		else
			cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"))

		/* pass in parameters */
		int offset = 0;

		cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_rgba, sizeof(d_rgba)))
		offset += sizeof(d_rgba);

		cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_buffer, sizeof(d_buffer)))
		offset += sizeof(d_buffer);

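		/* The render buffer accumulates a running sum over samples, so
		 * scaling by 1/(sample + 1) converts the sum to an average for
		 * display. */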
		float sample_scale = 1.0f/(task.sample + 1);
		offset = align_up(offset, __alignof(sample_scale));

		cuda_assert(cuParamSetf(cuFilmConvert, offset, sample_scale))
		offset += sizeof(sample_scale);

		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.x))
		offset += sizeof(task.x);

		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.y))
		offset += sizeof(task.y);

		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.w))
		offset += sizeof(task.w);

		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.h))
		offset += sizeof(task.h);

		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.offset))
		offset += sizeof(task.offset);

		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.stride))
		offset += sizeof(task.stride);

		cuda_assert(cuParamSetSize(cuFilmConvert, offset))

		/* launch kernel: todo find optimal size, cache config for fermi */
		int xthreads = 16;
		int ythreads = 16;
		int xblocks = (task.w + xthreads - 1)/xthreads;
		int yblocks = (task.h + ythreads - 1)/ythreads;

		cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1))
		cuda_assert(cuFuncSetBlockShape(cuFilmConvert, xthreads, ythreads, 1))
		cuda_assert(cuLaunchGrid(cuFilmConvert, xblocks, yblocks))

		unmap_pixels((rgba_byte)? rgba_byte: rgba_half);

		cuda_pop_context();
	}

	void shader(DeviceTask& task)
	{
		if(have_error())
			return;

		cuda_push_context();

		CUfunction cuDisplace;
		CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
		CUdeviceptr d_output = cuda_device_ptr(task.shader_output);

		/* get kernel function */
		cuda_assert(cuModuleGetFunction(&cuDisplace, cuModule, "kernel_cuda_shader"))

		/* pass in parameters */
		int offset = 0;

		cuda_assert(cuParamSetv(cuDisplace, offset, &d_input, sizeof(d_input)))
		offset += sizeof(d_input);

		cuda_assert(cuParamSetv(cuDisplace, offset, &d_output, sizeof(d_output)))
		offset += sizeof(d_output);

		int shader_eval_type = task.shader_eval_type;
		offset = align_up(offset, __alignof(shader_eval_type));

		cuda_assert(cuParamSeti(cuDisplace, offset, task.shader_eval_type))
		offset += sizeof(task.shader_eval_type);

		cuda_assert(cuParamSeti(cuDisplace, offset, task.shader_x))
		offset += sizeof(task.shader_x);

		cuda_assert(cuParamSetSize(cuDisplace, offset))

		/* launch kernel: todo find optimal size, cache config for fermi */
		int xthreads = 16;
		int xblocks = (task.shader_w + xthreads - 1)/xthreads;

		cuda_assert(cuFuncSetCacheConfig(cuDisplace, CU_FUNC_CACHE_PREFER_L1))
		cuda_assert(cuFuncSetBlockShape(cuDisplace, xthreads, 1, 1))
		cuda_assert(cuLaunchGrid(cuDisplace, xblocks, 1))

		cuda_pop_context();
	}

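	/* In interactive (non-background) mode display pixels live in an OpenGL
	 * PBO; mapping the registered graphics resource yields a CUDA pointer
	 * that the film conversion kernel can write into directly. */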
	CUdeviceptr map_pixels(device_ptr mem)
	{
		if(!background) {
			PixelMem pmem = pixel_mem_map[mem];
			CUdeviceptr buffer;

			size_t bytes;
			cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0))
			cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource))

			return buffer;
		}

		return cuda_device_ptr(mem);
	}

	void unmap_pixels(device_ptr mem)
	{
		if(!background) {
			PixelMem pmem = pixel_mem_map[mem];

			cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0))
		}
	}

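	/* Allocate display pixels as a GL PBO plus a texture to draw it with,
	 * and register the PBO with CUDA. If registration fails (no interop),
	 * fall back permanently to background mode and host-side pixel copies. */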
	void pixels_alloc(device_memory& mem)
	{
		if(!background) {
			PixelMem pmem;

			pmem.w = mem.data_width;
			pmem.h = mem.data_height;

			cuda_push_context();

			glGenBuffers(1, &pmem.cuPBO);
			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
			if(mem.data_type == TYPE_HALF)
				glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(GLhalf)*4, NULL, GL_DYNAMIC_DRAW);
			else
				glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(uint8_t)*4, NULL, GL_DYNAMIC_DRAW);

			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

			glGenTextures(1, &pmem.cuTexId);
			glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
			if(mem.data_type == TYPE_HALF)
				glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F_ARB, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL);
			else
				glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
			glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
			glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
			glBindTexture(GL_TEXTURE_2D, 0);

			CUresult result = cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);

			if(result == CUDA_SUCCESS) {
				cuda_pop_context();

				mem.device_pointer = pmem.cuTexId;
				pixel_mem_map[mem.device_pointer] = pmem;

				stats.mem_alloc(mem.memory_size());

				return;
			}
			else {
				/* failed to register buffer, fallback to no interop */
				glDeleteBuffers(1, &pmem.cuPBO);
				glDeleteTextures(1, &pmem.cuTexId);

				cuda_pop_context();

				background = true;
			}
		}

		Device::pixels_alloc(mem);
	}

	void pixels_copy_from(device_memory& mem, int y, int w, int h)
	{
		if(!background) {
			PixelMem pmem = pixel_mem_map[mem.device_pointer];

			cuda_push_context();

			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
			uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
			size_t offset = sizeof(uchar)*4*y*w;
			memcpy((uchar*)mem.data_pointer + offset, pixels + offset, sizeof(uchar)*4*w*h);
			glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

			cuda_pop_context();

			return;
		}

		Device::pixels_copy_from(mem, y, w, h);
	}

	void pixels_free(device_memory& mem)
	{
		if(mem.device_pointer) {
			if(!background) {
				PixelMem pmem = pixel_mem_map[mem.device_pointer];

				cuda_push_context();

				cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource))
				glDeleteBuffers(1, &pmem.cuPBO);
				glDeleteTextures(1, &pmem.cuTexId);

				cuda_pop_context();

				pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
				mem.device_pointer = 0;

				stats.mem_free(mem.memory_size());

				return;
			}

			Device::pixels_free(mem);
		}
	}

	void draw_pixels(device_memory& mem, int y, int w, int h, int dy, int width, int height, bool transparent)
	{
		if(!background) {
			PixelMem pmem = pixel_mem_map[mem.device_pointer];

			cuda_push_context();

			/* for multi devices, this assumes the inefficient method that we allocate
			 * all pixels on the device even though we only render to a subset */
			size_t offset = 4*y*w;

			if(mem.data_type == TYPE_HALF)
				offset *= sizeof(GLhalf);
			else
				offset *= sizeof(uint8_t);

			glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pmem.cuPBO);
			glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
			if(mem.data_type == TYPE_HALF)
				glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void*)offset);
			else
				glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void*)offset);
			glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);

			glEnable(GL_TEXTURE_2D);

			if(transparent) {
				glEnable(GL_BLEND);
				glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
			}

			glColor3f(1.0f, 1.0f, 1.0f);

			glPushMatrix();
			glTranslatef(0.0f, (float)dy, 0.0f);

			glBegin(GL_QUADS);

			glTexCoord2f(0.0f, 0.0f);
			glVertex2f(0.0f, 0.0f);
			glTexCoord2f((float)w/(float)pmem.w, 0.0f);
			glVertex2f((float)width, 0.0f);
			glTexCoord2f((float)w/(float)pmem.w, (float)h/(float)pmem.h);
			glVertex2f((float)width, (float)height);
			glTexCoord2f(0.0f, (float)h/(float)pmem.h);
			glVertex2f(0.0f, (float)height);

			glEnd();

			glPopMatrix();

			if(transparent)
				glDisable(GL_BLEND);

			glBindTexture(GL_TEXTURE_2D, 0);
			glDisable(GL_TEXTURE_2D);

			cuda_pop_context();

			return;
		}

		Device::draw_pixels(mem, y, w, h, dy, width, height, transparent);
	}

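	/* Path tracing runs on a pool thread: acquire tiles from the scheduler
	 * and render them sample by sample, updating progress after each sample
	 * so the UI stays responsive; on cancel, rendering stops mid-tile unless
	 * need_finish_queue requires the remaining samples to be finished. */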
	void thread_run(DeviceTask *task)
	{
		if(task->type == DeviceTask::PATH_TRACE) {
			RenderTile tile;

			bool branched = task->integrator_branched;

			/* keep rendering tiles until done */
			while(task->acquire_tile(this, tile)) {
				int start_sample = tile.start_sample;
				int end_sample = tile.start_sample + tile.num_samples;

				for(int sample = start_sample; sample < end_sample; sample++) {
					if(task->get_cancel()) {
						if(task->need_finish_queue == false)
							break;
					}

					path_trace(tile, sample, branched);

					tile.sample = sample + 1;

					task->update_progress(tile);
				}

				task->release_tile(tile);
			}
		}
		else if(task->type == DeviceTask::SHADER) {
			shader(*task);

			cuda_push_context();
			cuda_assert(cuCtxSynchronize())
			cuda_pop_context();
		}
	}

	class CUDADeviceTask : public DeviceTask {
	public:
		CUDADeviceTask(CUDADevice *device, DeviceTask& task)
		: DeviceTask(task)
		{
			run = function_bind(&CUDADevice::thread_run, device, this);
		}
	};

	void task_add(DeviceTask& task)
	{
		if(task.type == DeviceTask::FILM_CONVERT) {
			/* must be done in main thread due to opengl access */
			film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);

			cuda_push_context();
			cuda_assert(cuCtxSynchronize())
			cuda_pop_context();
		}
		else {
			task_pool.push(new CUDADeviceTask(this, task));
		}
	}

	void task_wait()
	{
		task_pool.wait();
	}

	void task_cancel()
	{
		task_pool.cancel();
	}
};

Device *device_cuda_create(DeviceInfo& info, Stats &stats, bool background)
{
	return new CUDADevice(info, stats, background);
}

void device_cuda_info(vector<DeviceInfo>& devices)
{
	CUresult result;
	int count = 0;

	result = cuInit(0);
	if(result != CUDA_SUCCESS) {
		if(result != CUDA_ERROR_NO_DEVICE)
			fprintf(stderr, "CUDA cuInit: %s\n", CUDADevice::cuda_error_string(result));
		return;
	}

	result = cuDeviceGetCount(&count);
	if(result != CUDA_SUCCESS) {
		fprintf(stderr, "CUDA cuDeviceGetCount: %s\n", CUDADevice::cuda_error_string(result));
		return;
	}

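	/* Devices that report a kernel execution timeout are assumed to drive a
	 * display; they are collected separately and appended after the
	 * compute-only devices so those are preferred by default. */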
	vector<DeviceInfo> display_devices;

	for(int num = 0; num < count; num++) {
		char name[256];
		int attr;

		if(cuDeviceGetName(name, 256, num) != CUDA_SUCCESS)
			continue;

		DeviceInfo info;

		info.type = DEVICE_CUDA;
		info.description = string(name);
		info.id = string_printf("CUDA_%d", num);
		info.num = num;

		int major, minor;
		cuDeviceComputeCapability(&major, &minor, num);
		info.advanced_shading = (major >= 2);
		info.pack_images = false;

		/* if device has a kernel timeout, assume it is used for display */
		if(cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num) == CUDA_SUCCESS && attr == 1) {
			info.display_device = true;
			display_devices.push_back(info);
		}
		else
			devices.push_back(info);
	}

	if(!display_devices.empty())
		devices.insert(devices.end(), display_devices.begin(), display_devices.end());
}

CCL_NAMESPACE_END