/*
 * Copyright 2011, Blender Foundation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "device.h"
#include "device_intern.h"

#include "util_cuda.h"
#include "util_debug.h"
#include "util_map.h"
#include "util_opengl.h"
#include "util_path.h"
#include "util_system.h"
#include "util_types.h"
#include "util_time.h"

CCL_NAMESPACE_BEGIN

class CUDADevice : public Device
{
public:
	CUdevice cuDevice;
	CUcontext cuContext;
	CUmodule cuModule;
	map<device_ptr, bool> tex_interp_map;
	int cuDevId;

	struct PixelMem {
		GLuint cuPBO;
		CUgraphicsResource cuPBOresource;
		GLuint cuTexId;
		int w, h;
	};

	map<device_ptr, PixelMem> pixel_mem_map;

	CUdeviceptr cuda_device_ptr(device_ptr mem)
	{
		return (CUdeviceptr)mem;
	}

	const char *cuda_error_string(CUresult result)
	{
		switch(result) {
			case CUDA_SUCCESS: return "No errors";
			case CUDA_ERROR_INVALID_VALUE: return "Invalid value";
			case CUDA_ERROR_OUT_OF_MEMORY: return "Out of memory";
			case CUDA_ERROR_NOT_INITIALIZED: return "Driver not initialized";
			case CUDA_ERROR_DEINITIALIZED: return "Driver deinitialized";

			case CUDA_ERROR_NO_DEVICE: return "No CUDA-capable device available";
			case CUDA_ERROR_INVALID_DEVICE: return "Invalid device";

			case CUDA_ERROR_INVALID_IMAGE: return "Invalid kernel image";
			case CUDA_ERROR_INVALID_CONTEXT: return "Invalid context";
			case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: return "Context already current";
			case CUDA_ERROR_MAP_FAILED: return "Map failed";
			case CUDA_ERROR_UNMAP_FAILED: return "Unmap failed";
			case CUDA_ERROR_ARRAY_IS_MAPPED: return "Array is mapped";
			case CUDA_ERROR_ALREADY_MAPPED: return "Already mapped";
			case CUDA_ERROR_NO_BINARY_FOR_GPU: return "No binary for GPU";
			case CUDA_ERROR_ALREADY_ACQUIRED: return "Already acquired";
			case CUDA_ERROR_NOT_MAPPED: return "Not mapped";
			case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: return "Mapped resource not available for access as an array";
			case CUDA_ERROR_NOT_MAPPED_AS_POINTER: return "Mapped resource not available for access as a pointer";
			case CUDA_ERROR_ECC_UNCORRECTABLE: return "Uncorrectable ECC error detected";
			case CUDA_ERROR_UNSUPPORTED_LIMIT: return "CUlimit not supported by device";

			case CUDA_ERROR_INVALID_SOURCE: return "Invalid source";
			case CUDA_ERROR_FILE_NOT_FOUND: return "File not found";
			case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "Link to a shared object failed to resolve";
			case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: return "Shared object initialization failed";

			case CUDA_ERROR_INVALID_HANDLE: return "Invalid handle";

			case CUDA_ERROR_NOT_FOUND: return "Not found";

			case CUDA_ERROR_NOT_READY: return "CUDA not ready";

			case CUDA_ERROR_LAUNCH_FAILED: return "Launch failed";
			case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "Launch exceeded resources";
			case CUDA_ERROR_LAUNCH_TIMEOUT: return "Launch exceeded timeout";
			case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "Launch with incompatible texturing";

			case CUDA_ERROR_UNKNOWN: return "Unknown error";

			default: return "Unknown CUDA error value";
		}
	}

#ifdef NDEBUG
#define cuda_abort()
#else
#define cuda_abort() abort()
#endif

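/* cuda_assert() wraps a CUDA driver API call: on failure it records the
 * first error message in error_msg, prints it to stderr, and aborts in
 * debug builds. The macro body is a bare block, so call sites may omit the
 * trailing semicolon:
 *
 *   cuda_assert(cuCtxSynchronize())
 */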
#define cuda_assert(stmt) \
	{ \
		CUresult result = stmt; \
		\
		if(result != CUDA_SUCCESS) { \
			string message = string_printf("CUDA error: %s in %s", cuda_error_string(result), #stmt); \
			if(error_msg == "") \
				error_msg = message; \
			fprintf(stderr, "%s\n", message.c_str()); \
			cuda_abort(); \
		} \
	}

	bool cuda_error(CUresult result)
	{
		if(result == CUDA_SUCCESS)
			return false;

		string message = string_printf("CUDA error: %s", cuda_error_string(result));
		if(error_msg == "")
			error_msg = message;
		fprintf(stderr, "%s\n", message.c_str());
		return true;
	}

	void cuda_error(const string& message)
	{
		if(error_msg == "")
			error_msg = message;
		fprintf(stderr, "%s\n", message.c_str());
	}

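	/* A CUDA driver context is bound per thread; these helpers make the
	 * device context current around each group of driver calls and unbind it
	 * again afterwards, so the context can be used from whichever thread
	 * calls in. */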
	void cuda_push_context()
	{
		cuda_assert(cuCtxSetCurrent(cuContext))
	}

	void cuda_pop_context()
	{
		cuda_assert(cuCtxSetCurrent(NULL))
	}

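	/* Initialization: cuInit, then device lookup and context creation. For
	 * interactive use a GL interop context is tried first; if that fails
	 * (e.g. no GL context available on this thread), fall back to a plain
	 * context and treat the device as a background render device. */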
	CUDADevice(DeviceInfo& info, bool background_)
	{
		background = background_;

		cuDevId = info.num;
		cuDevice = 0;
		cuContext = 0;

		/* initialize */
		if(cuda_error(cuInit(0)))
			return;

		/* setup device and context */
		if(cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
			return;

		CUresult result;

		if(background) {
			result = cuCtxCreate(&cuContext, 0, cuDevice);
		}
		else {
			result = cuGLCtxCreate(&cuContext, 0, cuDevice);

			if(result != CUDA_SUCCESS) {
				result = cuCtxCreate(&cuContext, 0, cuDevice);
				background = true;
			}
		}

		if(cuda_error(result))
			return;

		cuda_pop_context();
	}

	~CUDADevice()
	{
		cuda_push_context();
		cuda_assert(cuCtxDetach(cuContext))
	}

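	/* Non-experimental kernels are only supported on devices with compute
	 * capability 1.3 (sm_13) or newer; older cards are reported as
	 * unsupported. */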
	bool support_device(bool experimental)
	{
		if(!experimental) {
			int major, minor;
			cuDeviceComputeCapability(&major, &minor, cuDevId);

			if(major <= 1 && minor <= 2) {
				cuda_error(string_printf("CUDA device is supported only with compute capability 1.3 or higher, found %d.%d.", major, minor));
				return false;
			}
		}

		return true;
	}

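	/* Locate or build the cubin for this device, in order of preference:
	 *  1. a prebuilt binary shipped with Blender: lib/kernel_sm_<major><minor>.cubin
	 *  2. a cached local build keyed by the MD5 of the kernel sources:
	 *     cache/cycles_kernel_sm<major><minor>_<md5>.cubin
	 *  3. a fresh nvcc compile (not available in WITH_CUDA_BINARIES builds
	 *     on Windows).
	 * For a compute 2.0 device on a 64 bit build, the generated command looks
	 * roughly like this (illustrative; the exact flags are assembled below):
	 *
	 *   nvcc -arch=sm_20 -m64 --cubin kernel.cu -o <cache>.cubin --maxrregcount=24 -DNVCC
	 */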
	string compile_kernel()
	{
		/* compute cubin name */
		int major, minor;
		cuDeviceComputeCapability(&major, &minor, cuDevId);

		/* attempt to use kernel provided with blender */
		string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", major, minor));
		if(path_exists(cubin))
			return cubin;

		/* not found, try to use locally compiled kernel */
		string kernel_path = path_get("kernel");
		string md5 = path_files_md5_hash(kernel_path);

		cubin = string_printf("cycles_kernel_sm%d%d_%s.cubin", major, minor, md5.c_str());
		cubin = path_user_get(path_join("cache", cubin));

		/* if exists already, use it */
		if(path_exists(cubin))
			return cubin;

#if defined(WITH_CUDA_BINARIES) && defined(_WIN32)
		if(major <= 1 && minor <= 2)
			cuda_error(string_printf("CUDA device is supported only with compute capability 1.3 or higher, found %d.%d.", major, minor));
		else
			cuda_error(string_printf("CUDA binary kernel for this graphics card compute capability (%d.%d) not found.", major, minor));
		return "";
#else
		/* if not, find CUDA compiler */
		string nvcc = cuCompilerPath();

		if(nvcc == "") {
			cuda_error("CUDA nvcc compiler not found. Install CUDA toolkit in default location.");
			return "";
		}

		/* compile */
		string kernel = path_join(kernel_path, "kernel.cu");
		string include = kernel_path;
		const int machine = system_cpu_bits();
		const int maxreg = 24;

		double starttime = time_dt();
		printf("Compiling CUDA kernel ...\n");

		path_create_directories(cubin);

		string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" "
			"-o \"%s\" --ptxas-options=\"-v\" --maxrregcount=%d --opencc-options -OPT:Olimit=0 -I\"%s\" -DNVCC",
			nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), maxreg, include.c_str());

		if(system(command.c_str()) == -1) {
			cuda_error("Failed to execute compilation command, see console for details.");
			return "";
		}

		/* verify if compilation succeeded */
		if(!path_exists(cubin)) {
			cuda_error("CUDA kernel compilation failed, see console for details.");
			return "";
		}

		printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);

		return cubin;
#endif
	}

	bool load_kernels(bool experimental)
	{
		/* check if cuda init succeeded */
		if(cuContext == 0)
			return false;

		if(!support_device(experimental))
			return false;

		/* get kernel */
		string cubin = compile_kernel();

		if(cubin == "")
			return false;

		/* open module */
		cuda_push_context();

		CUresult result = cuModuleLoad(&cuModule, cubin.c_str());
		if(cuda_error(result))
			cuda_error(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));

		cuda_pop_context();

		return (result == CUDA_SUCCESS);
	}

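	/* Generic device memory: device_ptr is simply a CUdeviceptr in disguise,
	 * so allocation, host<->device copies, clearing and freeing map directly
	 * onto the cuMem* driver calls. */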
	void mem_alloc(device_memory& mem, MemoryType type)
	{
		cuda_push_context();
		CUdeviceptr device_pointer;
		cuda_assert(cuMemAlloc(&device_pointer, mem.memory_size()))
		mem.device_pointer = (device_ptr)device_pointer;
		cuda_pop_context();
	}

	void mem_copy_to(device_memory& mem)
	{
		cuda_push_context();
		cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size()))
		cuda_pop_context();
	}

	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
	{
		size_t offset = elem*y*w;
		size_t size = elem*w*h;

		cuda_push_context();
		cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset,
			(CUdeviceptr)((uchar*)mem.device_pointer + offset), size))
		cuda_pop_context();
	}

	void mem_zero(device_memory& mem)
	{
		memset((void*)mem.data_pointer, 0, mem.memory_size());

		cuda_push_context();
		cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()))
		cuda_pop_context();
	}

	void mem_free(device_memory& mem)
	{
		if(mem.device_pointer) {
			cuda_push_context();
			cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer)))
			cuda_pop_context();

			mem.device_pointer = 0;
		}
	}

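	/* Copy host data into a named global in the loaded module. In practice
	 * this is how the kernel's constant data block gets updated;
	 * cuModuleGetGlobal returns both the symbol address and its size. */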
	void const_copy_to(const char *name, void *host, size_t size)
	{
		CUdeviceptr mem;
		size_t bytes;

		cuda_push_context();
		cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name))
		//assert(bytes == size);
		cuda_assert(cuMemcpyHtoD(mem, host, size))
		cuda_pop_context();
	}

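	/* Texture binding has two paths: data that needs interpolation is copied
	 * into a CUDA array and bound with linear filtering and normalized
	 * coordinates, while plain data lookups stay in linear device memory
	 * bound with point sampling and read-as-integer. tex_interp_map records
	 * which path was taken so tex_free() knows whether to destroy an array
	 * or free linear memory. */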
	void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
	{
		/* determine format */
		CUarray_format_enum format;
		size_t dsize = datatype_size(mem.data_type);
		size_t size = mem.memory_size();

		switch(mem.data_type) {
			case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
			case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
			case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
			case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
			default: assert(0); return;
		}

		CUtexref texref;

		cuda_push_context();
		cuda_assert(cuModuleGetTexRef(&texref, cuModule, name))

		if(interpolation) {
			CUarray handle;
			CUDA_ARRAY_DESCRIPTOR desc;

			desc.Width = mem.data_width;
			desc.Height = mem.data_height;
			desc.Format = format;
			desc.NumChannels = mem.data_elements;

			cuda_assert(cuArrayCreate(&handle, &desc))

			if(mem.data_height > 1) {
				CUDA_MEMCPY2D param;
				memset(&param, 0, sizeof(param));
				param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
				param.dstArray = handle;
				param.srcMemoryType = CU_MEMORYTYPE_HOST;
				param.srcHost = (void*)mem.data_pointer;
				param.srcPitch = mem.data_width*dsize*mem.data_elements;
				param.WidthInBytes = param.srcPitch;
				param.Height = mem.data_height;

				cuda_assert(cuMemcpy2D(&param))
			}
			else
				cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size))

			cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT))

			cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR))
			cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES))

			mem.device_pointer = (device_ptr)handle;
		}
		else {
			cuda_pop_context();

			mem_alloc(mem, MEM_READ_ONLY);
			mem_copy_to(mem);

			cuda_push_context();

			cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size))
			cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT))
			cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER))
		}

		if(periodic) {
			cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_WRAP))
			cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_WRAP))
		}
		else {
			cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP))
			cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP))
		}
		cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements))

		cuda_pop_context();

		tex_interp_map[mem.device_pointer] = interpolation;
	}

	void tex_free(device_memory& mem)
	{
		if(mem.device_pointer) {
			if(tex_interp_map[mem.device_pointer]) {
				cuda_push_context();
				cuArrayDestroy((CUarray)mem.device_pointer);
				cuda_pop_context();

				tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
				mem.device_pointer = 0;
			}
			else {
				tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
				mem_free(mem);
			}
		}
	}

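	/* Launch the path tracing kernel using the old driver API argument
	 * interface: each argument is written at a manually tracked byte offset
	 * (aligned with align_up() before the int parameters) and the total size
	 * is declared with cuParamSetSize() before launching, following the
	 * pattern:
	 *
	 *   cuda_assert(cuParamSetv(func, offset, &ptr, sizeof(ptr)))
	 *   offset += sizeof(ptr);
	 *
	 * Block size is a fixed 16x16 (8x8 on OS X); finding the optimal size per
	 * device is still a todo. */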
	void path_trace(DeviceTask& task)
	{
		cuda_push_context();

		CUfunction cuPathTrace;
		CUdeviceptr d_buffer = cuda_device_ptr(task.buffer);
		CUdeviceptr d_rng_state = cuda_device_ptr(task.rng_state);

		/* get kernel function */
		cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"))

		/* pass in parameters */
		int offset = 0;

		cuda_assert(cuParamSetv(cuPathTrace, offset, &d_buffer, sizeof(d_buffer)))
		offset += sizeof(d_buffer);

		cuda_assert(cuParamSetv(cuPathTrace, offset, &d_rng_state, sizeof(d_rng_state)))
		offset += sizeof(d_rng_state);

		int sample = task.sample;
		offset = align_up(offset, __alignof(sample));

		cuda_assert(cuParamSeti(cuPathTrace, offset, task.sample))
		offset += sizeof(task.sample);

		cuda_assert(cuParamSeti(cuPathTrace, offset, task.x))
		offset += sizeof(task.x);

		cuda_assert(cuParamSeti(cuPathTrace, offset, task.y))
		offset += sizeof(task.y);

		cuda_assert(cuParamSeti(cuPathTrace, offset, task.w))
		offset += sizeof(task.w);

		cuda_assert(cuParamSeti(cuPathTrace, offset, task.h))
		offset += sizeof(task.h);

		cuda_assert(cuParamSeti(cuPathTrace, offset, task.offset))
		offset += sizeof(task.offset);

		cuda_assert(cuParamSeti(cuPathTrace, offset, task.stride))
		offset += sizeof(task.stride);

		cuda_assert(cuParamSetSize(cuPathTrace, offset))

		/* launch kernel: todo find optimal size, cache config for fermi */
#ifndef __APPLE__
		int xthreads = 16;
		int ythreads = 16;
#else
		int xthreads = 8;
		int ythreads = 8;
#endif
		int xblocks = (task.w + xthreads - 1)/xthreads;
		int yblocks = (task.h + ythreads - 1)/ythreads;

		cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1))
		cuda_assert(cuFuncSetBlockShape(cuPathTrace, xthreads, ythreads, 1))
		cuda_assert(cuLaunchGrid(cuPathTrace, xblocks, yblocks))

		cuda_pop_context();
	}

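	/* Convert the float render buffer to display rgba. The destination comes
	 * from map_pixels(): the GL interop PBO when drawing to the viewport, or
	 * plain device memory when running in the background. */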
	void tonemap(DeviceTask& task)
	{
		cuda_push_context();

		CUfunction cuFilmConvert;
		CUdeviceptr d_rgba = map_pixels(task.rgba);
		CUdeviceptr d_buffer = cuda_device_ptr(task.buffer);

		/* get kernel function */
		cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_tonemap"))

		/* pass in parameters */
		int offset = 0;

		cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_rgba, sizeof(d_rgba)))
		offset += sizeof(d_rgba);

		cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_buffer, sizeof(d_buffer)))
		offset += sizeof(d_buffer);

		int sample = task.sample;
		offset = align_up(offset, __alignof(sample));

		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.sample))
		offset += sizeof(task.sample);

		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.resolution))
		offset += sizeof(task.resolution);

		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.x))
		offset += sizeof(task.x);

		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.y))
		offset += sizeof(task.y);

		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.w))
		offset += sizeof(task.w);

		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.h))
		offset += sizeof(task.h);

		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.offset))
		offset += sizeof(task.offset);

		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.stride))
		offset += sizeof(task.stride);

		cuda_assert(cuParamSetSize(cuFilmConvert, offset))

		/* launch kernel: todo find optimal size, cache config for fermi */
#ifndef __APPLE__
		int xthreads = 16;
		int ythreads = 16;
#else
		int xthreads = 8;
		int ythreads = 8;
#endif
		int xblocks = (task.w + xthreads - 1)/xthreads;
		int yblocks = (task.h + ythreads - 1)/ythreads;

		cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1))
		cuda_assert(cuFuncSetBlockShape(cuFilmConvert, xthreads, ythreads, 1))
		cuda_assert(cuLaunchGrid(cuFilmConvert, xblocks, yblocks))

		unmap_pixels(task.rgba);

		cuda_pop_context();
	}

	void shader(DeviceTask& task)
	{
		cuda_push_context();

		CUfunction cuDisplace;
		CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
		CUdeviceptr d_output = cuda_device_ptr(task.shader_output);

		/* get kernel function */
		cuda_assert(cuModuleGetFunction(&cuDisplace, cuModule, "kernel_cuda_shader"))

		/* pass in parameters */
		int offset = 0;

		cuda_assert(cuParamSetv(cuDisplace, offset, &d_input, sizeof(d_input)))
		offset += sizeof(d_input);

		cuda_assert(cuParamSetv(cuDisplace, offset, &d_output, sizeof(d_output)))
		offset += sizeof(d_output);

		int shader_eval_type = task.shader_eval_type;
		offset = align_up(offset, __alignof(shader_eval_type));

		cuda_assert(cuParamSeti(cuDisplace, offset, task.shader_eval_type))
		offset += sizeof(task.shader_eval_type);

		cuda_assert(cuParamSeti(cuDisplace, offset, task.shader_x))
		offset += sizeof(task.shader_x);

		cuda_assert(cuParamSetSize(cuDisplace, offset))

		/* launch kernel: todo find optimal size, cache config for fermi */
#ifndef __APPLE__
		int xthreads = 16;
#else
		int xthreads = 8;
#endif
		int xblocks = (task.shader_w + xthreads - 1)/xthreads;

		cuda_assert(cuFuncSetCacheConfig(cuDisplace, CU_FUNC_CACHE_PREFER_L1))
		cuda_assert(cuFuncSetBlockShape(cuDisplace, xthreads, 1, 1))
		cuda_assert(cuLaunchGrid(cuDisplace, xblocks, 1))

		cuda_pop_context();
	}

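	/* With GL interop, display pixels live in a PBO that has to be mapped
	 * into the CUDA address space around kernel launches that write to it;
	 * in background mode the pixels are ordinary device memory. */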
	CUdeviceptr map_pixels(device_ptr mem)
	{
		if(!background) {
			PixelMem pmem = pixel_mem_map[mem];
			CUdeviceptr buffer;

			size_t bytes;
			cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0))
			cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource))

			return buffer;
		}

		return cuda_device_ptr(mem);
	}

	void unmap_pixels(device_ptr mem)
	{
		if(!background) {
			PixelMem pmem = pixel_mem_map[mem];

			cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0))
		}
	}

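	/* Allocate display pixels as a GL PBO plus texture and register the PBO
	 * with CUDA. If registration fails, fall back to the generic device
	 * implementation without interop and flip this device into background
	 * mode. */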
	void pixels_alloc(device_memory& mem)
	{
		if(!background) {
			PixelMem pmem;

			pmem.w = mem.data_width;
			pmem.h = mem.data_height;

			cuda_push_context();

			glGenBuffers(1, &pmem.cuPBO);
			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
			glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(GLfloat)*3, NULL, GL_DYNAMIC_DRAW);

			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

			glGenTextures(1, &pmem.cuTexId);
			glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
			glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
			glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
			glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
			glBindTexture(GL_TEXTURE_2D, 0);

			CUresult result = cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);

			if(!cuda_error(result)) {
				cuda_pop_context();

				mem.device_pointer = pmem.cuTexId;
				pixel_mem_map[mem.device_pointer] = pmem;

				return;
			}
			else {
				/* failed to register buffer, fallback to no interop */
				glDeleteBuffers(1, &pmem.cuPBO);
				glDeleteTextures(1, &pmem.cuTexId);

				cuda_pop_context();

				background = true;
			}
		}

		Device::pixels_alloc(mem);
	}

	void pixels_copy_from(device_memory& mem, int y, int w, int h)
	{
		if(!background) {
			PixelMem pmem = pixel_mem_map[mem.device_pointer];

			cuda_push_context();

			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
			uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
			size_t offset = sizeof(uchar)*4*y*w;
			memcpy((uchar*)mem.data_pointer + offset, pixels + offset, sizeof(uchar)*4*w*h);
			glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

			cuda_pop_context();

			return;
		}

		Device::pixels_copy_from(mem, y, w, h);
	}

	void pixels_free(device_memory& mem)
	{
		if(mem.device_pointer) {
			if(!background) {
				PixelMem pmem = pixel_mem_map[mem.device_pointer];

				cuda_push_context();

				cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource))
				glDeleteBuffers(1, &pmem.cuPBO);
				glDeleteTextures(1, &pmem.cuTexId);

				cuda_pop_context();

				pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
				mem.device_pointer = 0;

				return;
			}

			Device::pixels_free(mem);
		}
	}

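	/* Draw the rendered pixels to the viewport: upload the PBO contents into
	 * the GL texture with glTexSubImage2D and draw a textured quad, with
	 * premultiplied-alpha blending when a transparent film is used. */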
	void draw_pixels(device_memory& mem, int y, int w, int h, int dy, int width, int height, bool transparent)
	{
		if(!background) {
			PixelMem pmem = pixel_mem_map[mem.device_pointer];

			cuda_push_context();

			/* for multi devices, this assumes the inefficient method where we
			 * allocate all pixels on the device even though we only render to
			 * a subset */
			size_t offset = sizeof(uint8_t)*4*y*w;

			glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pmem.cuPBO);
			glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
			glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void*)offset);
			glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);

			glEnable(GL_TEXTURE_2D);

			if(transparent) {
				glEnable(GL_BLEND);
				glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
			}

			glColor3f(1.0f, 1.0f, 1.0f);

			glPushMatrix();
			glTranslatef(0.0f, (float)dy, 0.0f);

			glBegin(GL_QUADS);

			glTexCoord2f(0.0f, 0.0f);
			glVertex2f(0.0f, 0.0f);
			glTexCoord2f((float)w/(float)pmem.w, 0.0f);
			glVertex2f((float)width, 0.0f);
			glTexCoord2f((float)w/(float)pmem.w, (float)h/(float)pmem.h);
			glVertex2f((float)width, (float)height);
			glTexCoord2f(0.0f, (float)h/(float)pmem.h);
			glVertex2f(0.0f, (float)height);

			glEnd();

			glPopMatrix();

			if(transparent)
				glDisable(GL_BLEND);

			glBindTexture(GL_TEXTURE_2D, 0);
			glDisable(GL_TEXTURE_2D);

			cuda_pop_context();

			return;
		}

		Device::draw_pixels(mem, y, w, h, dy, width, height, transparent);
	}

	void task_add(DeviceTask& task)
	{
		if(task.type == DeviceTask::TONEMAP)
			tonemap(task);
		else if(task.type == DeviceTask::PATH_TRACE)
			path_trace(task);
		else if(task.type == DeviceTask::SHADER)
			shader(task);
	}

	void task_wait()
	{
		cuda_push_context();

		cuda_assert(cuCtxSynchronize())

		cuda_pop_context();
	}

	void task_cancel()
	{
	}
};

Device *device_cuda_create(DeviceInfo& info, bool background)
{
	return new CUDADevice(info, background);
}

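/* Enumerate available CUDA devices. Devices with a kernel execution timeout
 * are assumed to be driving a display and are appended after the others, so
 * dedicated compute cards come first in the list. */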
void device_cuda_info(vector<DeviceInfo>& devices)
{
	int count = 0;

	if(cuInit(0) != CUDA_SUCCESS)
		return;
	if(cuDeviceGetCount(&count) != CUDA_SUCCESS)
		return;

	vector<DeviceInfo> display_devices;

	for(int num = 0; num < count; num++) {
		char name[256];
		int attr;

		if(cuDeviceGetName(name, 256, num) != CUDA_SUCCESS)
			continue;

		DeviceInfo info;

		info.type = DEVICE_CUDA;
		info.description = string(name);
		info.id = string_printf("CUDA_%d", num);
		info.num = num;

		int major, minor;
		cuDeviceComputeCapability(&major, &minor, num);
		info.advanced_shading = (major >= 2);
		info.pack_images = false;

		/* if device has a kernel timeout, assume it is used for display */
		if(cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num) == CUDA_SUCCESS && attr == 1) {
			info.display_device = true;
			display_devices.push_back(info);
		}
		else
			devices.push_back(info);
	}

	if(!display_devices.empty())
		devices.insert(devices.end(), display_devices.begin(), display_devices.end());
}

CCL_NAMESPACE_END