76692ba86571626d409286b82d669b05c28f1dac
[blender.git] / intern / cycles / device / device_cuda.cpp
1 /*
2  * Copyright 2011, Blender Foundation.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License
6  * as published by the Free Software Foundation; either version 2
7  * of the License, or (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write to the Free Software Foundation,
16  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17  */
18
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22
23 #include "device.h"
24 #include "device_intern.h"
25
26 #include "util_cuda.h"
27 #include "util_debug.h"
28 #include "util_map.h"
29 #include "util_opengl.h"
30 #include "util_path.h"
31 #include "util_types.h"
32
33 CCL_NAMESPACE_BEGIN
34
35 class CUDADevice : public Device
36 {
37 public:
38         CUdevice cuDevice;
39         CUcontext cuContext;
40         CUmodule cuModule;
41         map<device_ptr, bool> tex_interp_map;
42         int cuDevId;
43
44         struct PixelMem {
45                 GLuint cuPBO;
46                 CUgraphicsResource cuPBOresource;
47                 GLuint cuTexId;
48                 int w, h;
49         };
50
51         map<device_ptr, PixelMem> pixel_mem_map;
52
53         CUdeviceptr cuda_device_ptr(device_ptr mem)
54         {
55                 return (CUdeviceptr)mem;
56         }
57
58         const char *cuda_error_string(CUresult result)
59         {
60                 switch(result) {
61                         case CUDA_SUCCESS: return "No errors";
62                         case CUDA_ERROR_INVALID_VALUE: return "Invalid value";
63                         case CUDA_ERROR_OUT_OF_MEMORY: return "Out of memory";
64                         case CUDA_ERROR_NOT_INITIALIZED: return "Driver not initialized";
65                         case CUDA_ERROR_DEINITIALIZED: return "Driver deinitialized";
66
67                         case CUDA_ERROR_NO_DEVICE: return "No CUDA-capable device available";
68                         case CUDA_ERROR_INVALID_DEVICE: return "Invalid device";
69
70                         case CUDA_ERROR_INVALID_IMAGE: return "Invalid kernel image";
71                         case CUDA_ERROR_INVALID_CONTEXT: return "Invalid context";
72                         case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: return "Context already current";
73                         case CUDA_ERROR_MAP_FAILED: return "Map failed";
74                         case CUDA_ERROR_UNMAP_FAILED: return "Unmap failed";
75                         case CUDA_ERROR_ARRAY_IS_MAPPED: return "Array is mapped";
76                         case CUDA_ERROR_ALREADY_MAPPED: return "Already mapped";
77                         case CUDA_ERROR_NO_BINARY_FOR_GPU: return "No binary for GPU";
78                         case CUDA_ERROR_ALREADY_ACQUIRED: return "Already acquired";
79                         case CUDA_ERROR_NOT_MAPPED: return "Not mapped";
80                         case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: return "Mapped resource not available for access as an array";
81                         case CUDA_ERROR_NOT_MAPPED_AS_POINTER: return "Mapped resource not available for access as a pointer";
82                         case CUDA_ERROR_ECC_UNCORRECTABLE: return "Uncorrectable ECC error detected";
83                         case CUDA_ERROR_UNSUPPORTED_LIMIT: return "CUlimit not supported by device";
84
85                         case CUDA_ERROR_INVALID_SOURCE: return "Invalid source";
86                         case CUDA_ERROR_FILE_NOT_FOUND: return "File not found";
87                         case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "Link to a shared object failed to resolve";
88                         case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: return "Shared object initialization failed";
89
90                         case CUDA_ERROR_INVALID_HANDLE: return "Invalid handle";
91
92                         case CUDA_ERROR_NOT_FOUND: return "Not found";
93
94                         case CUDA_ERROR_NOT_READY: return "CUDA not ready";
95
96                         case CUDA_ERROR_LAUNCH_FAILED: return "Launch failed";
97                         case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "Launch exceeded resources";
98                         case CUDA_ERROR_LAUNCH_TIMEOUT: return "Launch exceeded timeout";
99                         case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "Launch with incompatible texturing";
100
101                         case CUDA_ERROR_UNKNOWN: return "Unknown error";
102
103                         default: return "Unknown CUDA error value";
104                 }
105         }
106
107         static int cuda_align_up(int& offset, int alignment)
108         {
109                 return (offset + alignment - 1) & ~(alignment - 1);
110         }
111
/* cuda_abort(): calls abort() in builds without NDEBUG, and expands to
 * nothing when NDEBUG is defined. */
#ifndef NDEBUG
#define cuda_abort() abort()
#else
#define cuda_abort()
#endif

/* cuda_assert(stmt): evaluate a CUDA driver API call once; if it does not
 * return CUDA_SUCCESS, print the error description and the text of the
 * failing statement to stderr and call cuda_abort().
 *
 * The macro expands to a braced block rather than a single statement. Call
 * sites in this file omit the trailing semicolon, including inside unbraced
 * if/else, so the expansion must stay a plain block. */
#define cuda_assert(stmt) \
        { \
                CUresult result = stmt; \
                \
                if(result != CUDA_SUCCESS) { \
                        fprintf(stderr, "CUDA error: %s in %s\n", cuda_error_string(result), #stmt); \
                        cuda_abort(); \
                } \
        }
127
128         void cuda_push_context()
129         {
130                 cuda_assert(cuCtxSetCurrent(cuContext))
131         }
132
133         void cuda_pop_context()
134         {
135                 cuda_assert(cuCtxSetCurrent(NULL));
136         }
137
138         CUDADevice(bool background_)
139         {
140                 int major, minor;
141                 background = background_;
142
143                 cuDevId = 0;
144
145                 /* intialize */
146                 cuda_assert(cuInit(0))
147
148                 /* setup device and context */
149                 cuda_assert(cuDeviceGet(&cuDevice, cuDevId))
150
151                 if(background)
152                         cuda_assert(cuCtxCreate(&cuContext, 0, cuDevice))
153                 else
154                         cuda_assert(cuGLCtxCreate(&cuContext, 0, cuDevice))
155
156                 /* open module */
157                 cuDeviceComputeCapability(&major, &minor, cuDevId);
158                 string cubin = string_printf("lib/kernel_sm_%d%d.cubin", major, minor);
159                 cuda_assert(cuModuleLoad(&cuModule, path_get(cubin).c_str()))
160
161                 cuda_pop_context();
162         }
163
164         ~CUDADevice()
165         {
166                 cuda_push_context();
167                 cuda_assert(cuCtxDetach(cuContext))
168         }
169
170         string description()
171         {
172                 /* print device information */
173                 char deviceName[100];
174
175                 cuda_push_context();
176                 cuDeviceGetName(deviceName, 256, cuDevId);
177                 cuda_pop_context();
178
179                 return string("CUDA ") + deviceName;
180         }
181
182         void mem_alloc(device_memory& mem, MemoryType type)
183         {
184                 cuda_push_context();
185                 CUdeviceptr device_pointer;
186                 cuda_assert(cuMemAlloc(&device_pointer, mem.memory_size()))
187                 mem.device_pointer = (device_ptr)device_pointer;
188                 cuda_pop_context();
189         }
190
191         void mem_copy_to(device_memory& mem)
192         {
193                 cuda_push_context();
194                 cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size()))
195                 cuda_pop_context();
196         }
197
198         void mem_copy_from(device_memory& mem, size_t offset, size_t size)
199         {
200                 /* todo: offset is ignored */
201                 cuda_push_context();
202                 cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset,
203                         (CUdeviceptr)((uchar*)mem.device_pointer + offset), size))
204                 cuda_pop_context();
205         }
206
207         void mem_zero(device_memory& mem)
208         {
209                 memset((void*)mem.data_pointer, 0, mem.memory_size());
210
211                 cuda_push_context();
212                 cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()))
213                 cuda_pop_context();
214         }
215
216         void mem_free(device_memory& mem)
217         {
218                 if(mem.device_pointer) {
219                         cuda_push_context();
220                         cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer)))
221                         cuda_pop_context();
222
223                         mem.device_pointer = 0;
224                 }
225         }
226
227         void const_copy_to(const char *name, void *host, size_t size)
228         {
229                 CUdeviceptr mem;
230                 size_t bytes;
231
232                 cuda_push_context();
233                 cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name))
234                 assert(bytes == size);
235                 cuda_assert(cuMemcpyHtoD(mem, host, size))
236                 cuda_pop_context();
237         }
238
239         void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
240         {
241                 /* determine format */
242                 CUarray_format_enum format;
243                 size_t dsize = datatype_size(mem.data_type);
244                 size_t size = mem.memory_size();
245
246                 switch(mem.data_type) {
247                         case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
248                         case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
249                         case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
250                         case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
251                         default: assert(0); return;
252                 }
253
254                 CUtexref texref;
255
256                 cuda_push_context();
257                 cuda_assert(cuModuleGetTexRef(&texref, cuModule, name))
258
259                 if(interpolation) {
260                         CUarray handle;
261                         CUDA_ARRAY_DESCRIPTOR desc;
262
263                         desc.Width = mem.data_width;
264                         desc.Height = mem.data_height;
265                         desc.Format = format;
266                         desc.NumChannels = mem.data_elements;
267
268                         cuda_assert(cuArrayCreate(&handle, &desc))
269
270                         if(mem.data_height > 1) {
271                                 CUDA_MEMCPY2D param;
272                                 memset(&param, 0, sizeof(param));
273                                 param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
274                                 param.dstArray = handle;
275                                 param.srcMemoryType = CU_MEMORYTYPE_HOST;
276                                 param.srcHost = (void*)mem.data_pointer;
277                                 param.srcPitch = mem.data_width*dsize*mem.data_elements;
278                                 param.WidthInBytes = param.srcPitch;
279                                 param.Height = mem.data_height;
280
281                                 cuda_assert(cuMemcpy2D(&param))
282                         }
283                         else
284                                 cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size))
285
286                         cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT))
287
288                         cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR))
289                         cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES))
290
291                         mem.device_pointer = (device_ptr)handle;
292                 }
293                 else {
294                         cuda_pop_context();
295
296                         mem_alloc(mem, MEM_READ_ONLY);
297                         mem_copy_to(mem);
298
299                         cuda_push_context();
300
301                         cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size))
302                         cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT))
303                         cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER))
304                 }
305
306                 if(periodic) {
307                         cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_WRAP))
308                         cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_WRAP))
309                 }
310                 else {
311                         cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP))
312                         cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP))
313                 }
314                 cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements))
315
316                 cuda_pop_context();
317
318                 tex_interp_map[mem.device_pointer] = interpolation;
319         }
320
321         void tex_free(device_memory& mem)
322         {
323                 if(mem.device_pointer) {
324                         if(tex_interp_map[mem.device_pointer]) {
325                                 cuda_push_context();
326                                 cuArrayDestroy((CUarray)mem.device_pointer);
327                                 cuda_pop_context();
328
329                                 tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
330                                 mem.device_pointer = 0;
331                         }
332                         else {
333                                 tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
334                                 mem_free(mem);
335                         }
336                 }
337         }
338
339         void path_trace(DeviceTask& task)
340         {
341                 cuda_push_context();
342
343                 CUfunction cuPathTrace;
344                 CUdeviceptr d_buffer = cuda_device_ptr(task.buffer);
345                 CUdeviceptr d_rng_state = cuda_device_ptr(task.rng_state);
346
347                 /* get kernel function */
348                 cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"))
349                 
350                 /* pass in parameters */
351                 int offset = 0;
352                 
353                 cuda_assert(cuParamSetv(cuPathTrace, offset, &d_buffer, sizeof(d_buffer)))
354                 offset += sizeof(d_buffer);
355
356                 cuda_assert(cuParamSetv(cuPathTrace, offset, &d_rng_state, sizeof(d_rng_state)))
357                 offset += sizeof(d_rng_state);
358
359                 offset = cuda_align_up(offset, __alignof(task.pass));
360
361                 cuda_assert(cuParamSeti(cuPathTrace, offset, task.pass))
362                 offset += sizeof(task.pass);
363
364                 cuda_assert(cuParamSeti(cuPathTrace, offset, task.x))
365                 offset += sizeof(task.x);
366
367                 cuda_assert(cuParamSeti(cuPathTrace, offset, task.y))
368                 offset += sizeof(task.y);
369
370                 cuda_assert(cuParamSeti(cuPathTrace, offset, task.w))
371                 offset += sizeof(task.w);
372
373                 cuda_assert(cuParamSeti(cuPathTrace, offset, task.h))
374                 offset += sizeof(task.h);
375
376                 cuda_assert(cuParamSetSize(cuPathTrace, offset))
377
378                 /* launch kernel: todo find optimal size, cache config for fermi */
379 #ifndef __APPLE__
380                 int xthreads = 16;
381                 int ythreads = 16;
382 #else
383                 int xthreads = 8;
384                 int ythreads = 8;
385 #endif
386                 int xblocks = (task.w + xthreads - 1)/xthreads;
387                 int yblocks = (task.h + ythreads - 1)/ythreads;
388
389                 cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1))
390                 cuda_assert(cuFuncSetBlockShape(cuPathTrace, xthreads, ythreads, 1))
391                 cuda_assert(cuLaunchGrid(cuPathTrace, xblocks, yblocks))
392
393                 cuda_pop_context();
394         }
395
396         void tonemap(DeviceTask& task)
397         {
398                 cuda_push_context();
399
400                 CUfunction cuFilmConvert;
401                 CUdeviceptr d_rgba = map_pixels(task.rgba);
402                 CUdeviceptr d_buffer = cuda_device_ptr(task.buffer);
403
404                 /* get kernel function */
405                 cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_tonemap"))
406
407                 /* pass in parameters */
408                 int offset = 0;
409
410                 cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_rgba, sizeof(d_rgba)))
411                 offset += sizeof(d_rgba);
412                 
413                 cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_buffer, sizeof(d_buffer)))
414                 offset += sizeof(d_buffer);
415
416                 offset = cuda_align_up(offset, __alignof(task.pass));
417
418                 cuda_assert(cuParamSeti(cuFilmConvert, offset, task.pass))
419                 offset += sizeof(task.pass);
420
421                 cuda_assert(cuParamSeti(cuFilmConvert, offset, task.resolution))
422                 offset += sizeof(task.resolution);
423
424                 cuda_assert(cuParamSeti(cuFilmConvert, offset, task.x))
425                 offset += sizeof(task.x);
426
427                 cuda_assert(cuParamSeti(cuFilmConvert, offset, task.y))
428                 offset += sizeof(task.y);
429
430                 cuda_assert(cuParamSeti(cuFilmConvert, offset, task.w))
431                 offset += sizeof(task.w);
432
433                 cuda_assert(cuParamSeti(cuFilmConvert, offset, task.h))
434                 offset += sizeof(task.h);
435
436                 cuda_assert(cuParamSetSize(cuFilmConvert, offset))
437
438                 /* launch kernel: todo find optimal size, cache config for fermi */
439 #ifndef __APPLE__
440                 int xthreads = 16;
441                 int ythreads = 16;
442 #else
443                 int xthreads = 8;
444                 int ythreads = 8;
445 #endif
446                 int xblocks = (task.w + xthreads - 1)/xthreads;
447                 int yblocks = (task.h + ythreads - 1)/ythreads;
448
449                 cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1))
450                 cuda_assert(cuFuncSetBlockShape(cuFilmConvert, xthreads, ythreads, 1))
451                 cuda_assert(cuLaunchGrid(cuFilmConvert, xblocks, yblocks))
452
453                 unmap_pixels(task.rgba);
454
455                 cuda_pop_context();
456         }
457
458         void displace(DeviceTask& task)
459         {
460                 cuda_push_context();
461
462                 CUfunction cuDisplace;
463                 CUdeviceptr d_input = cuda_device_ptr(task.displace_input);
464                 CUdeviceptr d_offset = cuda_device_ptr(task.displace_offset);
465
466                 /* get kernel function */
467                 cuda_assert(cuModuleGetFunction(&cuDisplace, cuModule, "kernel_cuda_displace"))
468                 
469                 /* pass in parameters */
470                 int offset = 0;
471                 
472                 cuda_assert(cuParamSetv(cuDisplace, offset, &d_input, sizeof(d_input)))
473                 offset += sizeof(d_input);
474
475                 cuda_assert(cuParamSetv(cuDisplace, offset, &d_offset, sizeof(d_offset)))
476                 offset += sizeof(d_offset);
477
478                 offset = cuda_align_up(offset, __alignof(task.displace_x));
479
480                 cuda_assert(cuParamSeti(cuDisplace, offset, task.displace_x))
481                 offset += sizeof(task.displace_x);
482
483                 cuda_assert(cuParamSetSize(cuDisplace, offset))
484
485                 /* launch kernel: todo find optimal size, cache config for fermi */
486 #ifndef __APPLE__
487                 int xthreads = 16;
488 #else
489                 int xthreads = 8;
490 #endif
491                 int xblocks = (task.displace_w + xthreads - 1)/xthreads;
492
493                 cuda_assert(cuFuncSetCacheConfig(cuDisplace, CU_FUNC_CACHE_PREFER_L1))
494                 cuda_assert(cuFuncSetBlockShape(cuDisplace, xthreads, 1, 1))
495                 cuda_assert(cuLaunchGrid(cuDisplace, xblocks, 1))
496
497                 cuda_pop_context();
498         }
499
500         CUdeviceptr map_pixels(device_ptr mem)
501         {
502                 if(!background) {
503                         PixelMem pmem = pixel_mem_map[mem];
504                         CUdeviceptr buffer;
505                         
506                         size_t bytes;
507                         cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0))
508                         cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource))
509                         
510                         return buffer;
511                 }
512
513                 return cuda_device_ptr(mem);
514         }
515
516         void unmap_pixels(device_ptr mem)
517         {
518                 if(!background) {
519                         PixelMem pmem = pixel_mem_map[mem];
520
521                         cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0))
522                 }
523         }
524
525         void pixels_alloc(device_memory& mem)
526         {
527                 if(!background) {
528                         PixelMem pmem;
529
530                         pmem.w = mem.data_width;
531                         pmem.h = mem.data_height;
532
533                         cuda_push_context();
534
535                         glGenBuffers(1, &pmem.cuPBO);
536                         glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
537                         glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(GLfloat)*3, NULL, GL_DYNAMIC_DRAW);
538                         
539                         glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
540                         
541                         glGenTextures(1, &pmem.cuTexId);
542                         glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
543                         glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
544                         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
545                         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
546                         glBindTexture(GL_TEXTURE_2D, 0);
547                         
548                         cuda_assert(cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE))
549
550                         cuda_pop_context();
551
552                         mem.device_pointer = pmem.cuTexId;
553                         pixel_mem_map[mem.device_pointer] = pmem;
554
555                         return;
556                 }
557
558                 Device::pixels_alloc(mem);
559         }
560
561         void pixels_copy_from(device_memory& mem, int y, int w, int h)
562         {
563                 if(!background) {
564                         PixelMem pmem = pixel_mem_map[mem.device_pointer];
565
566                         cuda_push_context();
567
568                         glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
569                         uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
570                         size_t offset = sizeof(uchar)*4*y*w;
571                         memcpy((uchar*)mem.data_pointer + offset, pixels + offset, sizeof(uchar)*4*w*h);
572                         glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
573                         glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
574
575                         cuda_pop_context();
576
577                         return;
578                 }
579
580                 Device::pixels_copy_from(mem, y, w, h);
581         }
582
583         void pixels_free(device_memory& mem)
584         {
585                 if(mem.device_pointer) {
586                         if(!background) {
587                                 PixelMem pmem = pixel_mem_map[mem.device_pointer];
588
589                                 cuda_push_context();
590
591                                 cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource))
592                                 glDeleteBuffers(1, &pmem.cuPBO);
593                                 glDeleteTextures(1, &pmem.cuTexId);
594
595                                 cuda_pop_context();
596
597                                 pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
598                                 mem.device_pointer = 0;
599
600                                 return;
601                         }
602
603                         Device::pixels_free(mem);
604                 }
605         }
606
607         void draw_pixels(device_memory& mem, int y, int w, int h, int width, int height)
608         {
609                 if(!background) {
610                         PixelMem pmem = pixel_mem_map[mem.device_pointer];
611
612                         cuda_push_context();
613
614                         glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pmem.cuPBO);
615                         glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
616                         glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, 0);
617                         glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
618                         
619                         glEnable(GL_TEXTURE_2D);
620                         
621                         glColor3f(1.0f, 1.0f, 1.0f);
622
623                         glPushMatrix();
624                         glTranslatef(0, y, 0.0f);
625                         
626                         glBegin(GL_QUADS);
627                         
628                         glTexCoord2f(0, 0);
629                         glVertex2f(0, 0);
630                         glTexCoord2f((float)w/(float)width, 0);
631                         glVertex2f(width, 0);
632                         glTexCoord2f((float)w/(float)width, (float)h/(float)height);
633                         glVertex2f(width, height);
634                         glTexCoord2f(0, (float)h/(float)height);
635                         glVertex2f(0, height);
636
637                         glEnd();
638
639                         glPopMatrix();
640                         
641                         glBindTexture(GL_TEXTURE_2D, 0);
642                         glDisable(GL_TEXTURE_2D);
643
644                         cuda_pop_context();
645
646                         return;
647                 }
648
649                 Device::draw_pixels(mem, y, w, h, width, height);
650         }
651
652         void task_add(DeviceTask& task)
653         {
654                 if(task.type == DeviceTask::TONEMAP)
655                         tonemap(task);
656                 else if(task.type == DeviceTask::PATH_TRACE)
657                         path_trace(task);
658                 else if(task.type == DeviceTask::DISPLACE)
659                         displace(task);
660         }
661
662         void task_wait()
663         {
664                 cuda_push_context();
665
666                 cuda_assert(cuCtxSynchronize())
667
668                 cuda_pop_context();
669         }
670
671         void task_cancel()
672         {
673         }
674 };
675
676 Device *device_cuda_create(bool background)
677 {
678         return new CUDADevice(background);
679 }
680
681 CCL_NAMESPACE_END
682