Remove usage of WITH_CYCLES_CUDA_BINARIES in code, use check for precompiled kernels instead
intern/cycles/device/device_cuda.cpp
/*
 * Copyright 2011, Blender Foundation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "device.h"
#include "device_intern.h"

#include "buffers.h"

#include "util_cuda.h"
#include "util_debug.h"
#include "util_map.h"
#include "util_opengl.h"
#include "util_path.h"
#include "util_system.h"
#include "util_types.h"
#include "util_time.h"

CCL_NAMESPACE_BEGIN

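/* CUDA device implementation, using the CUDA driver API. the device owns a
 * context and a module with the compiled kernels, and serves memory, texture,
 * render and display requests coming from the device layer. */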
class CUDADevice : public Device
{
public:
        TaskPool task_pool;
        CUdevice cuDevice;
        CUcontext cuContext;
        CUmodule cuModule;
        map<device_ptr, bool> tex_interp_map;
        int cuDevId;

        struct PixelMem {
                GLuint cuPBO;
                CUgraphicsResource cuPBOresource;
                GLuint cuTexId;
                int w, h;
        };

        map<device_ptr, PixelMem> pixel_mem_map;

        CUdeviceptr cuda_device_ptr(device_ptr mem)
        {
                return (CUdeviceptr)mem;
        }

        static const char *cuda_error_string(CUresult result)
        {
                switch(result) {
                        case CUDA_SUCCESS: return "No errors";
                        case CUDA_ERROR_INVALID_VALUE: return "Invalid value";
                        case CUDA_ERROR_OUT_OF_MEMORY: return "Out of memory";
                        case CUDA_ERROR_NOT_INITIALIZED: return "Driver not initialized";
                        case CUDA_ERROR_DEINITIALIZED: return "Driver deinitialized";

                        case CUDA_ERROR_NO_DEVICE: return "No CUDA-capable device available";
                        case CUDA_ERROR_INVALID_DEVICE: return "Invalid device";

                        case CUDA_ERROR_INVALID_IMAGE: return "Invalid kernel image";
                        case CUDA_ERROR_INVALID_CONTEXT: return "Invalid context";
                        case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: return "Context already current";
                        case CUDA_ERROR_MAP_FAILED: return "Map failed";
                        case CUDA_ERROR_UNMAP_FAILED: return "Unmap failed";
                        case CUDA_ERROR_ARRAY_IS_MAPPED: return "Array is mapped";
                        case CUDA_ERROR_ALREADY_MAPPED: return "Already mapped";
                        case CUDA_ERROR_NO_BINARY_FOR_GPU: return "No binary for GPU";
                        case CUDA_ERROR_ALREADY_ACQUIRED: return "Already acquired";
                        case CUDA_ERROR_NOT_MAPPED: return "Not mapped";
                        case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: return "Mapped resource not available for access as an array";
                        case CUDA_ERROR_NOT_MAPPED_AS_POINTER: return "Mapped resource not available for access as a pointer";
                        case CUDA_ERROR_ECC_UNCORRECTABLE: return "Uncorrectable ECC error detected";
                        case CUDA_ERROR_UNSUPPORTED_LIMIT: return "CUlimit not supported by device";

                        case CUDA_ERROR_INVALID_SOURCE: return "Invalid source";
                        case CUDA_ERROR_FILE_NOT_FOUND: return "File not found";
                        case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "Link to a shared object failed to resolve";
                        case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: return "Shared object initialization failed";

                        case CUDA_ERROR_INVALID_HANDLE: return "Invalid handle";

                        case CUDA_ERROR_NOT_FOUND: return "Not found";

                        case CUDA_ERROR_NOT_READY: return "CUDA not ready";

                        case CUDA_ERROR_LAUNCH_FAILED: return "Launch failed";
                        case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "Launch exceeded resources";
                        case CUDA_ERROR_LAUNCH_TIMEOUT: return "Launch exceeded timeout";
                        case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "Launch with incompatible texturing";

                        case CUDA_ERROR_UNKNOWN: return "Unknown error";

                        default: return "Unknown CUDA error value";
                }
        }

#ifdef NDEBUG
#define cuda_abort()
#else
#define cuda_abort() abort()
#endif

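/* cuda_assert logs a failure and records the first error message on the
 * device rather than aborting, so a render can fail gracefully; uncomment
 * cuda_abort() in the macro below to trap failures in a debugger. */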
#define cuda_assert(stmt) \
        do { \
                CUresult result = stmt; \
                \
                if(result != CUDA_SUCCESS) { \
                        string message = string_printf("CUDA error: %s in %s", cuda_error_string(result), #stmt); \
                        if(error_msg == "") \
                                error_msg = message; \
                        fprintf(stderr, "%s\n", message.c_str()); \
                        /*cuda_abort();*/ \
                } \
        } while(0)

        bool cuda_error(CUresult result)
        {
                if(result == CUDA_SUCCESS)
                        return false;

                string message = string_printf("CUDA error: %s", cuda_error_string(result));
                if(error_msg == "")
                        error_msg = message;
                fprintf(stderr, "%s\n", message.c_str());
                return true;
        }

        void cuda_error(const string& message)
        {
                if(error_msg == "")
                        error_msg = message;
                fprintf(stderr, "%s\n", message.c_str());
        }

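        /* CUDA contexts are bound per-thread in the driver API, so the context
         * is made current before a sequence of API calls and released after. */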
        void cuda_push_context()
        {
                cuda_assert(cuCtxSetCurrent(cuContext));
        }

        void cuda_pop_context()
        {
                cuda_assert(cuCtxSetCurrent(NULL));
        }

        CUDADevice(DeviceInfo& info, Stats &stats, bool background_) : Device(stats)
        {
                background = background_;

                cuDevId = info.num;
                cuDevice = 0;
                cuContext = 0;

                /* initialize */
                if(cuda_error(cuInit(0)))
                        return;

                /* setup device and context */
                if(cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
                        return;

                CUresult result;

                if(background) {
                        result = cuCtxCreate(&cuContext, 0, cuDevice);
                }
                else {
                        result = cuGLCtxCreate(&cuContext, 0, cuDevice);

                        if(result != CUDA_SUCCESS) {
                                result = cuCtxCreate(&cuContext, 0, cuDevice);
                                background = true;
                        }
                }

                if(cuda_error(result))
                        return;

                cuda_pop_context();
        }

        ~CUDADevice()
        {
                task_pool.stop();

                cuda_push_context();
                cuda_assert(cuCtxDetach(cuContext));
        }

        bool support_device(bool experimental)
        {
                if(!experimental) {
                        int major, minor;
                        cuDeviceComputeCapability(&major, &minor, cuDevId);

                        if(major <= 1 && minor <= 2) {
                                cuda_error(string_printf("CUDA device supported only with compute capability 1.3 or up, found %d.%d.", major, minor));
                                return false;
                        }
                }

                return true;
        }

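        /* find a cubin for the current device, in order of preference: a kernel
         * binary shipped with blender, a previously compiled kernel in the user
         * cache (keyed by compute capability and an md5 over the kernel sources),
         * and finally a fresh nvcc compile. returns an empty string on failure. */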
        string compile_kernel()
        {
                /* compute cubin name */
                int major, minor;
                cuDeviceComputeCapability(&major, &minor, cuDevId);

                /* attempt to use kernel provided with blender */
                string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", major, minor));
                if(path_exists(cubin))
                        return cubin;

                /* not found, try to use locally compiled kernel */
                string kernel_path = path_get("kernel");
                string md5 = path_files_md5_hash(kernel_path);

                cubin = string_printf("cycles_kernel_sm%d%d_%s.cubin", major, minor, md5.c_str());
                cubin = path_user_get(path_join("cache", cubin));

                /* if exists already, use it */
                if(path_exists(cubin))
                        return cubin;

#ifdef _WIN32
                if(cuHavePrecompiledKernels()) {
                        if(major <= 1 && minor <= 2)
                                cuda_error(string_printf("CUDA device supported only with compute capability 1.3 or up, found %d.%d.", major, minor));
                        else
                                cuda_error(string_printf("CUDA binary kernel for this graphics card compute capability (%d.%d) not found.", major, minor));
                        return "";
                }
#endif

                /* if not, find CUDA compiler */
                string nvcc = cuCompilerPath();

                if(nvcc == "") {
                        cuda_error("CUDA nvcc compiler not found. Install CUDA toolkit in default location.");
                        return "";
                }

                /* compile */
                string kernel = path_join(kernel_path, "kernel.cu");
                string include = kernel_path;
                const int machine = system_cpu_bits();
                const int maxreg = 24;

                double starttime = time_dt();
                printf("Compiling CUDA kernel ...\n");

                path_create_directories(cubin);

                string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" "
                        "-o \"%s\" --ptxas-options=\"-v\" --maxrregcount=%d --opencc-options -OPT:Olimit=0 -I\"%s\" -DNVCC",
                        nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), maxreg, include.c_str());
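                /* for example, on a compute capability 2.1 card with a 64-bit build
                 * this expands to something like (paths are illustrative):
                 *   "nvcc" -arch=sm_21 -m64 --cubin "<kernel_path>/kernel.cu"
                 *     -o "<cache>/cycles_kernel_sm21_<md5>.cubin" --ptxas-options="-v"
                 *     --maxrregcount=24 --opencc-options -OPT:Olimit=0 -I"<kernel_path>" -DNVCC */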

                if(system(command.c_str()) == -1) {
                        cuda_error("Failed to execute compilation command, see console for details.");
                        return "";
                }

                /* verify if compilation succeeded */
                if(!path_exists(cubin)) {
                        cuda_error("CUDA kernel compilation failed, see console for details.");
                        return "";
                }

                printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);

                return cubin;
        }

        bool load_kernels(bool experimental)
        {
                /* check if cuda init succeeded */
                if(cuContext == 0)
                        return false;

                if(!support_device(experimental))
                        return false;

                /* get kernel */
                string cubin = compile_kernel();

                if(cubin == "")
                        return false;

                /* open module */
                cuda_push_context();

                CUresult result = cuModuleLoad(&cuModule, cubin.c_str());
                if(cuda_error(result))
                        cuda_error(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));

                cuda_pop_context();

                return (result == CUDA_SUCCESS);
        }

        void mem_alloc(device_memory& mem, MemoryType type)
        {
                cuda_push_context();
                CUdeviceptr device_pointer;
                size_t size = mem.memory_size();
                cuda_assert(cuMemAlloc(&device_pointer, size));
                mem.device_pointer = (device_ptr)device_pointer;
                stats.mem_alloc(size);
                cuda_pop_context();
        }

        void mem_copy_to(device_memory& mem)
        {
                cuda_push_context();
                if(mem.device_pointer)
                        cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size()));
                cuda_pop_context();
        }

        void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
        {
                size_t offset = elem*y*w;
                size_t size = elem*w*h;

                cuda_push_context();
                if(mem.device_pointer) {
                        cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset,
                                (CUdeviceptr)((uchar*)mem.device_pointer + offset), size));
                }
                else {
                        memset((char*)mem.data_pointer + offset, 0, size);
                }
                cuda_pop_context();
        }

        void mem_zero(device_memory& mem)
        {
                memset((void*)mem.data_pointer, 0, mem.memory_size());

                cuda_push_context();
                if(mem.device_pointer)
                        cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()));
                cuda_pop_context();
        }

        void mem_free(device_memory& mem)
        {
                if(mem.device_pointer) {
                        cuda_push_context();
                        cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer)));
                        cuda_pop_context();

                        mem.device_pointer = 0;

                        stats.mem_free(mem.memory_size());
                }
        }

        void const_copy_to(const char *name, void *host, size_t size)
        {
                CUdeviceptr mem;
                size_t bytes;

                cuda_push_context();
                cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
                //assert(bytes == size);
                cuda_assert(cuMemcpyHtoD(mem, host, size));
                cuda_pop_context();
        }

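        /* textures are bound in one of two ways: with interpolation, the data is
         * copied into a CUDA array and sampled with linear filtering and
         * normalized coordinates; without, it is plain device memory bound as a
         * point-sampled integer texture. tex_interp_map remembers which path was
         * taken so tex_free() can release the right resource. */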
        void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
        {
                /* determine format */
                CUarray_format_enum format;
                size_t dsize = datatype_size(mem.data_type);
                size_t size = mem.memory_size();

                switch(mem.data_type) {
                        case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
                        case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
                        case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
                        case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
                        default: assert(0); return;
                }

                CUtexref texref = NULL;

                cuda_push_context();
                cuda_assert(cuModuleGetTexRef(&texref, cuModule, name));

                if(!texref) {
                        cuda_pop_context();
                        return;
                }

                if(interpolation) {
                        CUarray handle = NULL;
                        CUDA_ARRAY_DESCRIPTOR desc;

                        desc.Width = mem.data_width;
                        desc.Height = mem.data_height;
                        desc.Format = format;
                        desc.NumChannels = mem.data_elements;

                        cuda_assert(cuArrayCreate(&handle, &desc));

                        if(!handle) {
                                cuda_pop_context();
                                return;
                        }

                        if(mem.data_height > 1) {
                                CUDA_MEMCPY2D param;
                                memset(&param, 0, sizeof(param));
                                param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
                                param.dstArray = handle;
                                param.srcMemoryType = CU_MEMORYTYPE_HOST;
                                param.srcHost = (void*)mem.data_pointer;
                                param.srcPitch = mem.data_width*dsize*mem.data_elements;
                                param.WidthInBytes = param.srcPitch;
                                param.Height = mem.data_height;

                                cuda_assert(cuMemcpy2D(&param));
                        }
                        else
                                cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size));

                        cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT));

                        cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR));
                        cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES));

                        mem.device_pointer = (device_ptr)handle;

                        stats.mem_alloc(size);
                }
                else {
                        cuda_pop_context();

                        mem_alloc(mem, MEM_READ_ONLY);
                        mem_copy_to(mem);

                        cuda_push_context();

                        cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size));
                        cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
                        cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER));
                }

                if(periodic) {
                        cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_WRAP));
                        cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_WRAP));
                }
                else {
                        cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP));
                        cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP));
                }
                cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements));

                cuda_pop_context();

                tex_interp_map[mem.device_pointer] = interpolation;
        }

        void tex_free(device_memory& mem)
        {
                if(mem.device_pointer) {
                        if(tex_interp_map[mem.device_pointer]) {
                                cuda_push_context();
                                cuArrayDestroy((CUarray)mem.device_pointer);
                                cuda_pop_context();

                                tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
                                mem.device_pointer = 0;

                                stats.mem_free(mem.memory_size());
                        }
                        else {
                                tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
                                mem_free(mem);
                        }
                }
        }

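        /* launch the path tracing kernel for a single sample over a tile. kernel
         * arguments use the manual driver API convention: each argument is
         * written into the parameter buffer at an explicit offset, aligned to
         * the argument type with align_up(), and the total size is registered
         * with cuParamSetSize(). */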
        void path_trace(RenderTile& rtile, int sample)
        {
                if(have_error())
                        return;

                cuda_push_context();

                CUfunction cuPathTrace;
                CUdeviceptr d_buffer = cuda_device_ptr(rtile.buffer);
                CUdeviceptr d_rng_state = cuda_device_ptr(rtile.rng_state);

                /* get kernel function */
                cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));

                /* pass in parameters */
                int offset = 0;

                cuda_assert(cuParamSetv(cuPathTrace, offset, &d_buffer, sizeof(d_buffer)));
                offset += sizeof(d_buffer);

                cuda_assert(cuParamSetv(cuPathTrace, offset, &d_rng_state, sizeof(d_rng_state)));
                offset += sizeof(d_rng_state);

                offset = align_up(offset, __alignof(sample));

                cuda_assert(cuParamSeti(cuPathTrace, offset, sample));
                offset += sizeof(sample);

                cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.x));
                offset += sizeof(rtile.x);

                cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.y));
                offset += sizeof(rtile.y);

                cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.w));
                offset += sizeof(rtile.w);

                cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.h));
                offset += sizeof(rtile.h);

                cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.offset));
                offset += sizeof(rtile.offset);

                cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.stride));
                offset += sizeof(rtile.stride);

                cuda_assert(cuParamSetSize(cuPathTrace, offset));

                /* launch kernel: todo find optimal size, cache config for fermi */
#ifndef __APPLE__
                int xthreads = 16;
                int ythreads = 16;
#else
                int xthreads = 8;
                int ythreads = 8;
#endif
                int xblocks = (rtile.w + xthreads - 1)/xthreads;
                int yblocks = (rtile.h + ythreads - 1)/ythreads;

                cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
                cuda_assert(cuFuncSetBlockShape(cuPathTrace, xthreads, ythreads, 1));
                cuda_assert(cuLaunchGrid(cuPathTrace, xblocks, yblocks));

                cuda_assert(cuCtxSynchronize());

                cuda_pop_context();
        }

        void tonemap(DeviceTask& task, device_ptr buffer, device_ptr rgba)
        {
                if(have_error())
                        return;

                cuda_push_context();

                CUfunction cuFilmConvert;
                CUdeviceptr d_rgba = map_pixels(rgba);
                CUdeviceptr d_buffer = cuda_device_ptr(buffer);

                /* get kernel function */
                cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_tonemap"));

                /* pass in parameters */
                int offset = 0;

                cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_rgba, sizeof(d_rgba)));
                offset += sizeof(d_rgba);

                cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_buffer, sizeof(d_buffer)));
                offset += sizeof(d_buffer);

                int sample = task.sample;
                offset = align_up(offset, __alignof(sample));

                cuda_assert(cuParamSeti(cuFilmConvert, offset, task.sample));
                offset += sizeof(task.sample);

                cuda_assert(cuParamSeti(cuFilmConvert, offset, task.resolution));
                offset += sizeof(task.resolution);

                cuda_assert(cuParamSeti(cuFilmConvert, offset, task.x));
                offset += sizeof(task.x);

                cuda_assert(cuParamSeti(cuFilmConvert, offset, task.y));
                offset += sizeof(task.y);

                cuda_assert(cuParamSeti(cuFilmConvert, offset, task.w));
                offset += sizeof(task.w);

                cuda_assert(cuParamSeti(cuFilmConvert, offset, task.h));
                offset += sizeof(task.h);

                cuda_assert(cuParamSeti(cuFilmConvert, offset, task.offset));
                offset += sizeof(task.offset);

                cuda_assert(cuParamSeti(cuFilmConvert, offset, task.stride));
                offset += sizeof(task.stride);

                cuda_assert(cuParamSetSize(cuFilmConvert, offset));

                /* launch kernel: todo find optimal size, cache config for fermi */
#ifndef __APPLE__
                int xthreads = 16;
                int ythreads = 16;
#else
                int xthreads = 8;
                int ythreads = 8;
#endif
                int xblocks = (task.w + xthreads - 1)/xthreads;
                int yblocks = (task.h + ythreads - 1)/ythreads;

                cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1));
                cuda_assert(cuFuncSetBlockShape(cuFilmConvert, xthreads, ythreads, 1));
                cuda_assert(cuLaunchGrid(cuFilmConvert, xblocks, yblocks));

                /* unmap the same pixels that were mapped above */
                unmap_pixels(rgba);

                cuda_pop_context();
        }

        void shader(DeviceTask& task)
        {
                if(have_error())
                        return;

                cuda_push_context();

                CUfunction cuDisplace;
                CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
                CUdeviceptr d_output = cuda_device_ptr(task.shader_output);

                /* get kernel function */
                cuda_assert(cuModuleGetFunction(&cuDisplace, cuModule, "kernel_cuda_shader"));

                /* pass in parameters */
                int offset = 0;

                cuda_assert(cuParamSetv(cuDisplace, offset, &d_input, sizeof(d_input)));
                offset += sizeof(d_input);

                cuda_assert(cuParamSetv(cuDisplace, offset, &d_output, sizeof(d_output)));
                offset += sizeof(d_output);

                int shader_eval_type = task.shader_eval_type;
                offset = align_up(offset, __alignof(shader_eval_type));

                cuda_assert(cuParamSeti(cuDisplace, offset, task.shader_eval_type));
                offset += sizeof(task.shader_eval_type);

                cuda_assert(cuParamSeti(cuDisplace, offset, task.shader_x));
                offset += sizeof(task.shader_x);

                cuda_assert(cuParamSetSize(cuDisplace, offset));

                /* launch kernel: todo find optimal size, cache config for fermi */
#ifndef __APPLE__
                int xthreads = 16;
#else
                int xthreads = 8;
#endif
                int xblocks = (task.shader_w + xthreads - 1)/xthreads;

                cuda_assert(cuFuncSetCacheConfig(cuDisplace, CU_FUNC_CACHE_PREFER_L1));
                cuda_assert(cuFuncSetBlockShape(cuDisplace, xthreads, 1, 1));
                cuda_assert(cuLaunchGrid(cuDisplace, xblocks, 1));

                cuda_pop_context();
        }

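        /* with OpenGL interop, display pixels live in a GL pixel buffer object
         * that must be mapped into the CUDA address space around kernel access;
         * in background mode the pointer is ordinary device memory. */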
        CUdeviceptr map_pixels(device_ptr mem)
        {
                if(!background) {
                        PixelMem pmem = pixel_mem_map[mem];
                        CUdeviceptr buffer;

                        size_t bytes;
                        cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0));
                        cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource));

                        return buffer;
                }

                return cuda_device_ptr(mem);
        }

        void unmap_pixels(device_ptr mem)
        {
                if(!background) {
                        PixelMem pmem = pixel_mem_map[mem];

                        cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0));
                }
        }

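        /* allocate display pixels as a GL PBO plus texture and register the PBO
         * with CUDA; if registration fails, fall back to background mode and
         * plain host-side pixel buffers. */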
        void pixels_alloc(device_memory& mem)
        {
                if(!background) {
                        PixelMem pmem;

                        pmem.w = mem.data_width;
                        pmem.h = mem.data_height;

                        cuda_push_context();

                        glGenBuffers(1, &pmem.cuPBO);
                        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
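                        /* note this reserves 3*sizeof(GLfloat) = 12 bytes per
                         * pixel, more than the 4 bytes of RGBA8 data the texture
                         * upload in draw_pixels() appears to use, presumably
                         * leaving headroom for float pixel formats. */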
                        glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(GLfloat)*3, NULL, GL_DYNAMIC_DRAW);

                        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

                        glGenTextures(1, &pmem.cuTexId);
                        glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
                        glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
                        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
                        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
                        glBindTexture(GL_TEXTURE_2D, 0);

                        CUresult result = cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);

                        if(!cuda_error(result)) {
                                cuda_pop_context();

                                mem.device_pointer = pmem.cuTexId;
                                pixel_mem_map[mem.device_pointer] = pmem;

                                stats.mem_alloc(mem.memory_size());

                                return;
                        }
                        else {
                                /* failed to register buffer, fallback to no interop */
                                glDeleteBuffers(1, &pmem.cuPBO);
                                glDeleteTextures(1, &pmem.cuTexId);

                                cuda_pop_context();

                                background = true;
                        }
                }

                Device::pixels_alloc(mem);
        }

        void pixels_copy_from(device_memory& mem, int y, int w, int h)
        {
                if(!background) {
                        PixelMem pmem = pixel_mem_map[mem.device_pointer];

                        cuda_push_context();

                        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
                        uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
                        size_t offset = sizeof(uchar)*4*y*w;
                        memcpy((uchar*)mem.data_pointer + offset, pixels + offset, sizeof(uchar)*4*w*h);
                        glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
                        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

                        cuda_pop_context();

                        return;
                }

                Device::pixels_copy_from(mem, y, w, h);
        }

        void pixels_free(device_memory& mem)
        {
                if(mem.device_pointer) {
                        if(!background) {
                                PixelMem pmem = pixel_mem_map[mem.device_pointer];

                                cuda_push_context();

                                cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
                                glDeleteBuffers(1, &pmem.cuPBO);
                                glDeleteTextures(1, &pmem.cuTexId);

                                cuda_pop_context();

                                pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
                                mem.device_pointer = 0;

                                stats.mem_free(mem.memory_size());

                                return;
                        }

                        Device::pixels_free(mem);
                }
        }

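        /* draw the rendered pixels by uploading the PBO contents into the GL
         * texture and rasterizing a textured quad, optionally with
         * premultiplied alpha blending for transparent film. */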
        void draw_pixels(device_memory& mem, int y, int w, int h, int dy, int width, int height, bool transparent)
        {
                if(!background) {
                        PixelMem pmem = pixel_mem_map[mem.device_pointer];

                        cuda_push_context();

                        /* for multi devices, this assumes the inefficient method where we
                         * allocate all pixels on the device even though we only render to
                         * a subset */
                        size_t offset = sizeof(uint8_t)*4*y*w;

                        glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pmem.cuPBO);
                        glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
                        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void*)offset);
                        glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);

                        glEnable(GL_TEXTURE_2D);

                        if(transparent) {
                                glEnable(GL_BLEND);
                                glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
                        }

                        glColor3f(1.0f, 1.0f, 1.0f);

                        glPushMatrix();
                        glTranslatef(0.0f, (float)dy, 0.0f);

                        glBegin(GL_QUADS);

                        glTexCoord2f(0.0f, 0.0f);
                        glVertex2f(0.0f, 0.0f);
                        glTexCoord2f((float)w/(float)pmem.w, 0.0f);
                        glVertex2f((float)width, 0.0f);
                        glTexCoord2f((float)w/(float)pmem.w, (float)h/(float)pmem.h);
                        glVertex2f((float)width, (float)height);
                        glTexCoord2f(0.0f, (float)h/(float)pmem.h);
                        glVertex2f(0.0f, (float)height);

                        glEnd();

                        glPopMatrix();

                        if(transparent)
                                glDisable(GL_BLEND);

                        glBindTexture(GL_TEXTURE_2D, 0);
                        glDisable(GL_TEXTURE_2D);

                        cuda_pop_context();

                        return;
                }

                Device::draw_pixels(mem, y, w, h, dy, width, height, transparent);
        }

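        /* worker entry point: keep acquiring tiles and rendering them sample by
         * sample until the task is done or cancelled. */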
        void thread_run(DeviceTask *task)
        {
                if(task->type == DeviceTask::PATH_TRACE) {
                        RenderTile tile;

                        /* keep rendering tiles until done */
                        while(task->acquire_tile(this, tile)) {
                                int start_sample = tile.start_sample;
                                int end_sample = tile.start_sample + tile.num_samples;

                                for(int sample = start_sample; sample < end_sample; sample++) {
                                        if(task->get_cancel()) {
                                                if(task->need_finish_queue == false)
                                                        break;
                                        }

                                        path_trace(tile, sample);

                                        tile.sample = sample + 1;

                                        task->update_progress(tile);
                                }

                                task->release_tile(tile);
                        }
                }
                else if(task->type == DeviceTask::SHADER) {
                        shader(*task);

                        cuda_push_context();
                        cuda_assert(cuCtxSynchronize());
                        cuda_pop_context();
                }
        }

        class CUDADeviceTask : public DeviceTask {
        public:
                CUDADeviceTask(CUDADevice *device, DeviceTask& task)
                : DeviceTask(task)
                {
                        run = function_bind(&CUDADevice::thread_run, device, this);
                }
        };

        void task_add(DeviceTask& task)
        {
                if(task.type == DeviceTask::TONEMAP) {
                        /* must be done in main thread due to opengl access */
                        tonemap(task, task.buffer, task.rgba);

                        cuda_push_context();
                        cuda_assert(cuCtxSynchronize());
                        cuda_pop_context();
                }
                else {
                        task_pool.push(new CUDADeviceTask(this, task));
                }
        }

        void task_wait()
        {
                task_pool.wait_work();
        }

        void task_cancel()
        {
                task_pool.cancel();
        }
};

Device *device_cuda_create(DeviceInfo& info, Stats &stats, bool background)
{
        return new CUDADevice(info, stats, background);
}

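/* enumerate available CUDA devices. devices with a kernel execution timeout
 * are assumed to drive a display and are listed after the others. */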
void device_cuda_info(vector<DeviceInfo>& devices)
{
        CUresult result;
        int count = 0;

        result = cuInit(0);
        if(result != CUDA_SUCCESS) {
                if(result != CUDA_ERROR_NO_DEVICE)
                        fprintf(stderr, "CUDA cuInit: %s\n", CUDADevice::cuda_error_string(result));
                return;
        }

        result = cuDeviceGetCount(&count);
        if(result != CUDA_SUCCESS) {
                fprintf(stderr, "CUDA cuDeviceGetCount: %s\n", CUDADevice::cuda_error_string(result));
                return;
        }

        vector<DeviceInfo> display_devices;

        for(int num = 0; num < count; num++) {
                char name[256];
                int attr;

                if(cuDeviceGetName(name, 256, num) != CUDA_SUCCESS)
                        continue;

                DeviceInfo info;

                info.type = DEVICE_CUDA;
                info.description = string(name);
                info.id = string_printf("CUDA_%d", num);
                info.num = num;

                int major, minor;
                cuDeviceComputeCapability(&major, &minor, num);
                info.advanced_shading = (major >= 2);
                info.pack_images = false;

                /* if device has a kernel timeout, assume it is used for display */
                if(cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num) == CUDA_SUCCESS && attr == 1) {
                        info.display_device = true;
                        display_devices.push_back(info);
                }
                else
                        devices.push_back(info);
        }

        if(!display_devices.empty())
                devices.insert(devices.end(), display_devices.begin(), display_devices.end());
}

CCL_NAMESPACE_END