Cycles: merge of changes from tomato branch.
intern/cycles/device/device_cuda.cpp
/*
 * Copyright 2011, Blender Foundation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "device.h"
#include "device_intern.h"

#include "buffers.h"

#include "util_cuda.h"
#include "util_debug.h"
#include "util_map.h"
#include "util_opengl.h"
#include "util_path.h"
#include "util_system.h"
#include "util_types.h"
#include "util_time.h"

CCL_NAMESPACE_BEGIN

class CUDADevice : public Device
{
public:
	TaskPool task_pool;
	CUdevice cuDevice;
	CUcontext cuContext;
	CUmodule cuModule;
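	/* maps a texture's device_ptr to whether it was allocated as a CUDA
	 * array (the interpolated path), so tex_free() knows whether to release
	 * it with cuArrayDestroy() or cuMemFree() */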
	map<device_ptr, bool> tex_interp_map;
	int cuDevId;

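	/* state for OpenGL interop display: a pixel buffer object shared with
	 * CUDA through a graphics resource handle, plus the GL texture used to
	 * draw it on screen */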
	struct PixelMem {
		GLuint cuPBO;
		CUgraphicsResource cuPBOresource;
		GLuint cuTexId;
		int w, h;
	};

	map<device_ptr, PixelMem> pixel_mem_map;

	CUdeviceptr cuda_device_ptr(device_ptr mem)
	{
		return (CUdeviceptr)mem;
	}

	const char *cuda_error_string(CUresult result)
	{
		switch(result) {
			case CUDA_SUCCESS: return "No errors";
			case CUDA_ERROR_INVALID_VALUE: return "Invalid value";
			case CUDA_ERROR_OUT_OF_MEMORY: return "Out of memory";
			case CUDA_ERROR_NOT_INITIALIZED: return "Driver not initialized";
			case CUDA_ERROR_DEINITIALIZED: return "Driver deinitialized";

			case CUDA_ERROR_NO_DEVICE: return "No CUDA-capable device available";
			case CUDA_ERROR_INVALID_DEVICE: return "Invalid device";

			case CUDA_ERROR_INVALID_IMAGE: return "Invalid kernel image";
			case CUDA_ERROR_INVALID_CONTEXT: return "Invalid context";
			case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: return "Context already current";
			case CUDA_ERROR_MAP_FAILED: return "Map failed";
			case CUDA_ERROR_UNMAP_FAILED: return "Unmap failed";
			case CUDA_ERROR_ARRAY_IS_MAPPED: return "Array is mapped";
			case CUDA_ERROR_ALREADY_MAPPED: return "Already mapped";
			case CUDA_ERROR_NO_BINARY_FOR_GPU: return "No binary for GPU";
			case CUDA_ERROR_ALREADY_ACQUIRED: return "Already acquired";
			case CUDA_ERROR_NOT_MAPPED: return "Not mapped";
			case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: return "Mapped resource not available for access as an array";
			case CUDA_ERROR_NOT_MAPPED_AS_POINTER: return "Mapped resource not available for access as a pointer";
			case CUDA_ERROR_ECC_UNCORRECTABLE: return "Uncorrectable ECC error detected";
			case CUDA_ERROR_UNSUPPORTED_LIMIT: return "CUlimit not supported by device";

			case CUDA_ERROR_INVALID_SOURCE: return "Invalid source";
			case CUDA_ERROR_FILE_NOT_FOUND: return "File not found";
			case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "Link to a shared object failed to resolve";
			case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: return "Shared object initialization failed";

			case CUDA_ERROR_INVALID_HANDLE: return "Invalid handle";

			case CUDA_ERROR_NOT_FOUND: return "Not found";

			case CUDA_ERROR_NOT_READY: return "CUDA not ready";

			case CUDA_ERROR_LAUNCH_FAILED: return "Launch failed";
			case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "Launch exceeded resources";
			case CUDA_ERROR_LAUNCH_TIMEOUT: return "Launch exceeded timeout";
			case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "Launch with incompatible texturing";

			case CUDA_ERROR_UNKNOWN: return "Unknown error";

			default: return "Unknown CUDA error value";
		}
	}

#ifdef NDEBUG
#define cuda_abort()
#else
#define cuda_abort() abort()
#endif

#define cuda_assert(stmt) \
	{ \
		CUresult result = stmt; \
		\
		if(result != CUDA_SUCCESS) { \
			string message = string_printf("CUDA error: %s in %s", cuda_error_string(result), #stmt); \
			if(error_msg == "") \
				error_msg = message; \
			fprintf(stderr, "%s\n", message.c_str()); \
			cuda_abort(); \
		} \
	}
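
	/* note: cuda_assert() records only the first error in error_msg, always
	 * logs to stderr, and aborts in debug builds. the macro body is a plain
	 * block rather than do { } while(0), which is why call sites throughout
	 * this file work without a trailing semicolon. */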

	bool cuda_error(CUresult result)
	{
		if(result == CUDA_SUCCESS)
			return false;

		string message = string_printf("CUDA error: %s", cuda_error_string(result));
		if(error_msg == "")
			error_msg = message;
		fprintf(stderr, "%s\n", message.c_str());
		return true;
	}

	void cuda_error(const string& message)
	{
		if(error_msg == "")
			error_msg = message;
		fprintf(stderr, "%s\n", message.c_str());
	}

	void cuda_push_context()
	{
		cuda_assert(cuCtxSetCurrent(cuContext))
	}

	void cuda_pop_context()
	{
		cuda_assert(cuCtxSetCurrent(NULL))
	}
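
	/* CUDA driver API contexts are bound per-thread; every entry point below
	 * binds this device's context on entry and unbinds it on exit, so the
	 * device can be used safely from whichever thread calls in */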

	CUDADevice(DeviceInfo& info, bool background_)
	{
		background = background_;

		cuDevId = info.num;
		cuDevice = 0;
		cuContext = 0;

		/* initialize */
		if(cuda_error(cuInit(0)))
			return;

		/* setup device and context */
		if(cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
			return;

		CUresult result;

		if(background) {
			result = cuCtxCreate(&cuContext, 0, cuDevice);
		}
		else {
			result = cuGLCtxCreate(&cuContext, 0, cuDevice);

			if(result != CUDA_SUCCESS) {
				result = cuCtxCreate(&cuContext, 0, cuDevice);
				background = true;
			}
		}

		if(cuda_error(result))
			return;

		cuda_pop_context();
	}

	~CUDADevice()
	{
		task_pool.stop();

		cuda_push_context();
		cuda_assert(cuCtxDetach(cuContext))
	}

	bool support_device(bool experimental)
	{
		if(!experimental) {
			int major, minor;
			cuDeviceComputeCapability(&major, &minor, cuDevId);

			if(major <= 1 && minor <= 2) {
				cuda_error(string_printf("CUDA device supported only with compute capability 1.3 or up, found %d.%d.", major, minor));
				return false;
			}
		}

		return true;
	}

	string compile_kernel()
	{
		/* compute cubin name */
		int major, minor;
		cuDeviceComputeCapability(&major, &minor, cuDevId);

		/* attempt to use kernel provided with Blender */
		string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", major, minor));
		if(path_exists(cubin))
			return cubin;

		/* not found, try to use locally compiled kernel */
		string kernel_path = path_get("kernel");
		string md5 = path_files_md5_hash(kernel_path);

		cubin = string_printf("cycles_kernel_sm%d%d_%s.cubin", major, minor, md5.c_str());
		cubin = path_user_get(path_join("cache", cubin));

		/* if it exists already, use it */
		if(path_exists(cubin))
			return cubin;

#if defined(WITH_CUDA_BINARIES) && defined(_WIN32)
		if(major <= 1 && minor <= 2)
			cuda_error(string_printf("CUDA device supported only with compute capability 1.3 or up, found %d.%d.", major, minor));
		else
			cuda_error(string_printf("CUDA binary kernel for this graphics card compute capability (%d.%d) not found.", major, minor));
		return "";
#else
		/* if not, find CUDA compiler */
		string nvcc = cuCompilerPath();

		if(nvcc == "") {
			cuda_error("CUDA nvcc compiler not found. Install CUDA toolkit in default location.");
			return "";
		}

		/* compile */
		string kernel = path_join(kernel_path, "kernel.cu");
		string include = kernel_path;
		const int machine = system_cpu_bits();
		const int maxreg = 24;

		double starttime = time_dt();
		printf("Compiling CUDA kernel ...\n");

		path_create_directories(cubin);

		string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" "
			"-o \"%s\" --ptxas-options=\"-v\" --maxrregcount=%d --opencc-options -OPT:Olimit=0 -I\"%s\" -DNVCC",
			nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), maxreg, include.c_str());
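
		/* flags: -arch=sm_XY compiles directly for this device's compute
		 * capability, -m%d matches the host's 32/64-bit pointer size, and
		 * --maxrregcount caps register use per thread to keep occupancy up;
		 * -DNVCC lets shared kernel headers detect CUDA compilation */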

		if(system(command.c_str()) == -1) {
			cuda_error("Failed to execute compilation command, see console for details.");
			return "";
		}

		/* verify that compilation succeeded */
		if(!path_exists(cubin)) {
			cuda_error("CUDA kernel compilation failed, see console for details.");
			return "";
		}

		printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);

		return cubin;
#endif
	}

	bool load_kernels(bool experimental)
	{
		/* check if CUDA initialization succeeded */
		if(cuContext == 0)
			return false;

		if(!support_device(experimental))
			return false;

		/* get kernel */
		string cubin = compile_kernel();

		if(cubin == "")
			return false;

		/* open module */
		cuda_push_context();

		CUresult result = cuModuleLoad(&cuModule, cubin.c_str());
		if(cuda_error(result))
			cuda_error(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));

		cuda_pop_context();

		return (result == CUDA_SUCCESS);
	}

	void mem_alloc(device_memory& mem, MemoryType type)
	{
		cuda_push_context();
		CUdeviceptr device_pointer;
		cuda_assert(cuMemAlloc(&device_pointer, mem.memory_size()))
		mem.device_pointer = (device_ptr)device_pointer;
		cuda_pop_context();
	}

	void mem_copy_to(device_memory& mem)
	{
		cuda_push_context();
		cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size()))
		cuda_pop_context();
	}

	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
	{
		size_t offset = elem*y*w;
		size_t size = elem*w*h;

		cuda_push_context();
		cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset,
			(CUdeviceptr)((uchar*)mem.device_pointer + offset), size))
		cuda_pop_context();
	}

	void mem_zero(device_memory& mem)
	{
		memset((void*)mem.data_pointer, 0, mem.memory_size());

		cuda_push_context();
		cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()))
		cuda_pop_context();
	}

	void mem_free(device_memory& mem)
	{
		if(mem.device_pointer) {
			cuda_push_context();
			cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer)))
			cuda_pop_context();

			mem.device_pointer = 0;
		}
	}

	void const_copy_to(const char *name, void *host, size_t size)
	{
		CUdeviceptr mem;
		size_t bytes;

		cuda_push_context();
		cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name))
		//assert(bytes == size);
		cuda_assert(cuMemcpyHtoD(mem, host, size))
		cuda_pop_context();
	}

	void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
	{
		/* determine format */
		CUarray_format_enum format;
		size_t dsize = datatype_size(mem.data_type);
		size_t size = mem.memory_size();

		switch(mem.data_type) {
			case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
			case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
			case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
			case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
			default: assert(0); return;
		}
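
		/* two binding paths below: interpolated textures are copied into a
		 * CUDA array and sampled with linear filtering and normalized
		 * coordinates, while everything else is bound as plain linear
		 * device memory with point sampling */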

		CUtexref texref;

		cuda_push_context();
		cuda_assert(cuModuleGetTexRef(&texref, cuModule, name))

		if(interpolation) {
			CUarray handle;
			CUDA_ARRAY_DESCRIPTOR desc;

			desc.Width = mem.data_width;
			desc.Height = mem.data_height;
			desc.Format = format;
			desc.NumChannels = mem.data_elements;

			cuda_assert(cuArrayCreate(&handle, &desc))

			if(mem.data_height > 1) {
				CUDA_MEMCPY2D param;
				memset(&param, 0, sizeof(param));
				param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
				param.dstArray = handle;
				param.srcMemoryType = CU_MEMORYTYPE_HOST;
				param.srcHost = (void*)mem.data_pointer;
				param.srcPitch = mem.data_width*dsize*mem.data_elements;
				param.WidthInBytes = param.srcPitch;
				param.Height = mem.data_height;

				cuda_assert(cuMemcpy2D(&param))
			}
			else
				cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size))

			cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT))

			cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR))
			cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES))

			mem.device_pointer = (device_ptr)handle;
		}
		else {
			cuda_pop_context();

			mem_alloc(mem, MEM_READ_ONLY);
			mem_copy_to(mem);

			cuda_push_context();

			cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size))
			cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT))
			cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER))
		}

		if(periodic) {
			cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_WRAP))
			cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_WRAP))
		}
		else {
			cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP))
			cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP))
		}
		cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements))

		cuda_pop_context();

		tex_interp_map[mem.device_pointer] = interpolation;
	}

	void tex_free(device_memory& mem)
	{
		if(mem.device_pointer) {
			if(tex_interp_map[mem.device_pointer]) {
				cuda_push_context();
				cuArrayDestroy((CUarray)mem.device_pointer);
				cuda_pop_context();

				tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
				mem.device_pointer = 0;
			}
			else {
				tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
				mem_free(mem);
			}
		}
	}

	void path_trace(RenderTile& rtile, int sample)
	{
		cuda_push_context();

		CUfunction cuPathTrace;
		CUdeviceptr d_buffer = cuda_device_ptr(rtile.buffer);
		CUdeviceptr d_rng_state = cuda_device_ptr(rtile.rng_state);

		/* get kernel function */
		cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"))

		/* pass in parameters */
		int offset = 0;

		cuda_assert(cuParamSetv(cuPathTrace, offset, &d_buffer, sizeof(d_buffer)))
		offset += sizeof(d_buffer);

		cuda_assert(cuParamSetv(cuPathTrace, offset, &d_rng_state, sizeof(d_rng_state)))
		offset += sizeof(d_rng_state);

		offset = align_up(offset, __alignof(sample));

		cuda_assert(cuParamSeti(cuPathTrace, offset, sample))
		offset += sizeof(sample);

		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.x))
		offset += sizeof(rtile.x);

		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.y))
		offset += sizeof(rtile.y);

		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.w))
		offset += sizeof(rtile.w);

		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.h))
		offset += sizeof(rtile.h);

		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.offset))
		offset += sizeof(rtile.offset);

		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.stride))
		offset += sizeof(rtile.stride);

		cuda_assert(cuParamSetSize(cuPathTrace, offset))
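
		/* this is the legacy (pre-CUDA 4.0) launch API: cuParamSetv/Seti
		 * write each argument into the kernel's parameter buffer at an
		 * explicit byte offset, align_up() keeps the int arguments at the
		 * alignment the device expects after the pointer arguments, and
		 * cuParamSetSize() declares the total size before launch */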

		/* launch kernel: todo: find optimal block size and cache config for Fermi */
#ifndef __APPLE__
		int xthreads = 16;
		int ythreads = 16;
#else
		int xthreads = 8;
		int ythreads = 8;
#endif
		int xblocks = (rtile.w + xthreads - 1)/xthreads;
		int yblocks = (rtile.h + ythreads - 1)/ythreads;
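
		/* (w + xthreads - 1)/xthreads rounds up so the grid always covers
		 * the full tile; threads that land outside the tile are presumably
		 * masked out inside the kernel itself */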

		cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1))
		cuda_assert(cuFuncSetBlockShape(cuPathTrace, xthreads, ythreads, 1))
		cuda_assert(cuLaunchGrid(cuPathTrace, xblocks, yblocks))

		cuda_assert(cuCtxSynchronize())

		cuda_pop_context();
	}

	void tonemap(DeviceTask& task, device_ptr buffer, device_ptr rgba)
	{
		cuda_push_context();

		CUfunction cuFilmConvert;
		CUdeviceptr d_rgba = map_pixels(rgba);
		CUdeviceptr d_buffer = cuda_device_ptr(buffer);

		/* get kernel function */
		cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_tonemap"))

		/* pass in parameters */
		int offset = 0;

		cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_rgba, sizeof(d_rgba)))
		offset += sizeof(d_rgba);

		cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_buffer, sizeof(d_buffer)))
		offset += sizeof(d_buffer);

		int sample = task.sample;
		offset = align_up(offset, __alignof(sample));

		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.sample))
		offset += sizeof(task.sample);

		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.resolution))
		offset += sizeof(task.resolution);

		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.x))
		offset += sizeof(task.x);

		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.y))
		offset += sizeof(task.y);

		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.w))
		offset += sizeof(task.w);

		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.h))
		offset += sizeof(task.h);

		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.offset))
		offset += sizeof(task.offset);

		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.stride))
		offset += sizeof(task.stride);

		cuda_assert(cuParamSetSize(cuFilmConvert, offset))

		/* launch kernel: todo: find optimal block size and cache config for Fermi */
#ifndef __APPLE__
		int xthreads = 16;
		int ythreads = 16;
#else
		int xthreads = 8;
		int ythreads = 8;
#endif
		int xblocks = (task.w + xthreads - 1)/xthreads;
		int yblocks = (task.h + ythreads - 1)/ythreads;

		cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1))
		cuda_assert(cuFuncSetBlockShape(cuFilmConvert, xthreads, ythreads, 1))
		cuda_assert(cuLaunchGrid(cuFilmConvert, xblocks, yblocks))

		unmap_pixels(task.rgba);

		cuda_pop_context();
	}

	void shader(DeviceTask& task)
	{
		cuda_push_context();

		CUfunction cuDisplace;
		CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
		CUdeviceptr d_offset = cuda_device_ptr(task.shader_output);

		/* get kernel function */
		cuda_assert(cuModuleGetFunction(&cuDisplace, cuModule, "kernel_cuda_shader"))

		/* pass in parameters */
		int offset = 0;

		cuda_assert(cuParamSetv(cuDisplace, offset, &d_input, sizeof(d_input)))
		offset += sizeof(d_input);

		cuda_assert(cuParamSetv(cuDisplace, offset, &d_offset, sizeof(d_offset)))
		offset += sizeof(d_offset);

		int shader_eval_type = task.shader_eval_type;
		offset = align_up(offset, __alignof(shader_eval_type));

		cuda_assert(cuParamSeti(cuDisplace, offset, task.shader_eval_type))
		offset += sizeof(task.shader_eval_type);

		cuda_assert(cuParamSeti(cuDisplace, offset, task.shader_x))
		offset += sizeof(task.shader_x);

		cuda_assert(cuParamSetSize(cuDisplace, offset))

		/* launch kernel: todo: find optimal block size and cache config for Fermi */
#ifndef __APPLE__
		int xthreads = 16;
#else
		int xthreads = 8;
#endif
		int xblocks = (task.shader_w + xthreads - 1)/xthreads;

		cuda_assert(cuFuncSetCacheConfig(cuDisplace, CU_FUNC_CACHE_PREFER_L1))
		cuda_assert(cuFuncSetBlockShape(cuDisplace, xthreads, 1, 1))
		cuda_assert(cuLaunchGrid(cuDisplace, xblocks, 1))

		cuda_pop_context();
	}

	CUdeviceptr map_pixels(device_ptr mem)
	{
		if(!background) {
			PixelMem pmem = pixel_mem_map[mem];
			CUdeviceptr buffer;

			size_t bytes;
			cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0))
			cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource))

			return buffer;
		}

		return cuda_device_ptr(mem);
	}

	void unmap_pixels(device_ptr mem)
	{
		if(!background) {
			PixelMem pmem = pixel_mem_map[mem];

			cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0))
		}
	}
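
	/* GL interop: the PBO is registered with CUDA once in pixels_alloc();
	 * map_pixels()/unmap_pixels() then bracket each kernel launch so the
	 * tonemap kernel can write straight into the GL buffer, skipping a
	 * round trip through host memory */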

	void pixels_alloc(device_memory& mem)
	{
		if(!background) {
			PixelMem pmem;

			pmem.w = mem.data_width;
			pmem.h = mem.data_height;

			cuda_push_context();

			glGenBuffers(1, &pmem.cuPBO);
			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
			glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(GLfloat)*3, NULL, GL_DYNAMIC_DRAW);

			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

			glGenTextures(1, &pmem.cuTexId);
			glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
			glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
			glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
			glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
			glBindTexture(GL_TEXTURE_2D, 0);

			CUresult result = cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);

			if(!cuda_error(result)) {
				cuda_pop_context();

				mem.device_pointer = pmem.cuTexId;
				pixel_mem_map[mem.device_pointer] = pmem;

				return;
			}
			else {
				/* failed to register buffer, fall back to no interop */
				glDeleteBuffers(1, &pmem.cuPBO);
				glDeleteTextures(1, &pmem.cuTexId);

				cuda_pop_context();

				background = true;
			}
		}

		Device::pixels_alloc(mem);
	}

	void pixels_copy_from(device_memory& mem, int y, int w, int h)
	{
		if(!background) {
			PixelMem pmem = pixel_mem_map[mem.device_pointer];

			cuda_push_context();

			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
			uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
			size_t offset = sizeof(uchar)*4*y*w;
			memcpy((uchar*)mem.data_pointer + offset, pixels + offset, sizeof(uchar)*4*w*h);
			glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

			cuda_pop_context();

			return;
		}

		Device::pixels_copy_from(mem, y, w, h);
	}

	void pixels_free(device_memory& mem)
	{
		if(mem.device_pointer) {
			if(!background) {
				PixelMem pmem = pixel_mem_map[mem.device_pointer];

				cuda_push_context();

				cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource))
				glDeleteBuffers(1, &pmem.cuPBO);
				glDeleteTextures(1, &pmem.cuTexId);

				cuda_pop_context();

				pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
				mem.device_pointer = 0;

				return;
			}

			Device::pixels_free(mem);
		}
	}

	void draw_pixels(device_memory& mem, int y, int w, int h, int dy, int width, int height, bool transparent)
	{
		if(!background) {
			PixelMem pmem = pixel_mem_map[mem.device_pointer];

			cuda_push_context();

			/* for multi devices, this assumes the inefficient method where we
			 * allocate all pixels on the device even though we only render to
			 * a subset */
			size_t offset = sizeof(uint8_t)*4*y*w;

			glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pmem.cuPBO);
			glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
			glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void*)offset);
			glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);

			glEnable(GL_TEXTURE_2D);

			if(transparent) {
				glEnable(GL_BLEND);
				glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
			}

			glColor3f(1.0f, 1.0f, 1.0f);

			glPushMatrix();
			glTranslatef(0.0f, (float)dy, 0.0f);

			glBegin(GL_QUADS);

			glTexCoord2f(0.0f, 0.0f);
			glVertex2f(0.0f, 0.0f);
			glTexCoord2f((float)w/(float)pmem.w, 0.0f);
			glVertex2f((float)width, 0.0f);
			glTexCoord2f((float)w/(float)pmem.w, (float)h/(float)pmem.h);
			glVertex2f((float)width, (float)height);
			glTexCoord2f(0.0f, (float)h/(float)pmem.h);
			glVertex2f(0.0f, (float)height);

			glEnd();

			glPopMatrix();

			if(transparent)
				glDisable(GL_BLEND);

			glBindTexture(GL_TEXTURE_2D, 0);
			glDisable(GL_TEXTURE_2D);

			cuda_pop_context();

			return;
		}

		Device::draw_pixels(mem, y, w, h, dy, width, height, transparent);
	}

	void thread_run(DeviceTask *task)
	{
		if(task->type == DeviceTask::PATH_TRACE) {
			RenderTile tile;

			/* keep rendering tiles until done */
			while(task->acquire_tile(this, tile)) {
				int start_sample = tile.start_sample;
				int end_sample = tile.start_sample + tile.num_samples;

				for(int sample = start_sample; sample < end_sample; sample++) {
					if(task->get_cancel())
						break;

					path_trace(tile, sample);

					tile.sample = sample + 1;

					task->update_progress(tile);
				}

				task->release_tile(tile);
			}
		}
		else if(task->type == DeviceTask::SHADER) {
			shader(*task);

			cuda_push_context();
			cuda_assert(cuCtxSynchronize())
			cuda_pop_context();
		}
	}

	class CUDADeviceTask : public DeviceTask {
	public:
		CUDADeviceTask(CUDADevice *device, DeviceTask& task)
		: DeviceTask(task)
		{
			run = function_bind(&CUDADevice::thread_run, device, this);
		}
	};

	void task_add(DeviceTask& task)
	{
		if(task.type == DeviceTask::TONEMAP) {
			/* must be done in main thread due to OpenGL access */
			tonemap(task, task.buffer, task.rgba);

			cuda_push_context();
			cuda_assert(cuCtxSynchronize())
			cuda_pop_context();
		}
		else {
			task_pool.push(new CUDADeviceTask(this, task));
		}
	}

	void task_wait()
	{
		task_pool.wait_work();
	}

	void task_cancel()
	{
		task_pool.cancel();
	}
};

Device *device_cuda_create(DeviceInfo& info, bool background)
{
	return new CUDADevice(info, background);
}
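
/* a minimal usage sketch, illustrative only; it assumes the Device base
 * class exposes the load_kernels/task_add/task_wait entry points implemented
 * above, and the DeviceTask setup is hypothetical, filled in by the caller:
 *
 *   vector<DeviceInfo> devices;
 *   device_cuda_info(devices);
 *
 *   if(!devices.empty()) {
 *       Device *device = device_cuda_create(devices[0], true);
 *
 *       if(device->load_kernels(false)) {
 *           DeviceTask task;  // hypothetical: configure type, buffers, tile size
 *           device->task_add(task);
 *           device->task_wait();
 *       }
 *
 *       delete device;
 *   }
 */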

void device_cuda_info(vector<DeviceInfo>& devices)
{
	int count = 0;

	if(cuInit(0) != CUDA_SUCCESS)
		return;
	if(cuDeviceGetCount(&count) != CUDA_SUCCESS)
		return;

	vector<DeviceInfo> display_devices;

	for(int num = 0; num < count; num++) {
		char name[256];
		int attr;

		if(cuDeviceGetName(name, 256, num) != CUDA_SUCCESS)
			continue;

		DeviceInfo info;

		info.type = DEVICE_CUDA;
		info.description = string(name);
		info.id = string_printf("CUDA_%d", num);
		info.num = num;

		int major, minor;
		cuDeviceComputeCapability(&major, &minor, num);
		info.advanced_shading = (major >= 2);
		info.pack_images = false;

		/* if device has a kernel timeout, assume it is used for display */
		if(cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num) == CUDA_SUCCESS && attr == 1) {
			info.display_device = true;
			display_devices.push_back(info);
		}
		else
			devices.push_back(info);
	}

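	/* display devices are appended last so that compute-only cards are
	 * preferred when a device is picked by default */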
	if(!display_devices.empty())
		devices.insert(devices.end(), display_devices.begin(), display_devices.end());
}

CCL_NAMESPACE_END