Cycles: first batch of windows build fixes, not quite there yet.
[blender.git] / intern / cycles / device / device_cuda.cpp
1 /*
2  * Copyright 2011, Blender Foundation.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License
6  * as published by the Free Software Foundation; either version 2
7  * of the License, or (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write to the Free Software Foundation,
16  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17  */
18
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22
23 #include "device.h"
24 #include "device_intern.h"
25
26 #include "util_cuda.h"
27 #include "util_debug.h"
28 #include "util_map.h"
29 #include "util_opengl.h"
30 #include "util_path.h"
31 #include "util_types.h"
32
33 CCL_NAMESPACE_BEGIN
34
35 class CUDADevice : public Device
36 {
37 public:
38         CUdevice cuDevice;
39         CUcontext cuContext;
40         CUmodule cuModule;
41         map<device_ptr, bool> tex_interp_map;
42         int cuDevId;
43
44         struct PixelMem {
45                 GLuint cuPBO;
46                 CUgraphicsResource cuPBOresource;
47                 GLuint cuTexId;
48                 int w, h;
49         };
50
51         map<device_ptr, PixelMem> pixel_mem_map;
52
53         CUdeviceptr cuda_device_ptr(device_ptr mem)
54         {
55                 return (CUdeviceptr)mem;
56         }
57
58         const char *cuda_error_string(CUresult result)
59         {
60                 switch(result) {
61                         case CUDA_SUCCESS: return "No errors";
62                         case CUDA_ERROR_INVALID_VALUE: return "Invalid value";
63                         case CUDA_ERROR_OUT_OF_MEMORY: return "Out of memory";
64                         case CUDA_ERROR_NOT_INITIALIZED: return "Driver not initialized";
65                         case CUDA_ERROR_DEINITIALIZED: return "Driver deinitialized";
66
67                         case CUDA_ERROR_NO_DEVICE: return "No CUDA-capable device available";
68                         case CUDA_ERROR_INVALID_DEVICE: return "Invalid device";
69
70                         case CUDA_ERROR_INVALID_IMAGE: return "Invalid kernel image";
71                         case CUDA_ERROR_INVALID_CONTEXT: return "Invalid context";
72                         case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: return "Context already current";
73                         case CUDA_ERROR_MAP_FAILED: return "Map failed";
74                         case CUDA_ERROR_UNMAP_FAILED: return "Unmap failed";
75                         case CUDA_ERROR_ARRAY_IS_MAPPED: return "Array is mapped";
76                         case CUDA_ERROR_ALREADY_MAPPED: return "Already mapped";
77                         case CUDA_ERROR_NO_BINARY_FOR_GPU: return "No binary for GPU";
78                         case CUDA_ERROR_ALREADY_ACQUIRED: return "Already acquired";
79                         case CUDA_ERROR_NOT_MAPPED: return "Not mapped";
80                         case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: return "Mapped resource not available for access as an array";
81                         case CUDA_ERROR_NOT_MAPPED_AS_POINTER: return "Mapped resource not available for access as a pointer";
82                         case CUDA_ERROR_ECC_UNCORRECTABLE: return "Uncorrectable ECC error detected";
83                         case CUDA_ERROR_UNSUPPORTED_LIMIT: return "CUlimit not supported by device";
84
85                         case CUDA_ERROR_INVALID_SOURCE: return "Invalid source";
86                         case CUDA_ERROR_FILE_NOT_FOUND: return "File not found";
87                         case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "Link to a shared object failed to resolve";
88                         case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: return "Shared object initialization failed";
89
90                         case CUDA_ERROR_INVALID_HANDLE: return "Invalid handle";
91
92                         case CUDA_ERROR_NOT_FOUND: return "Not found";
93
94                         case CUDA_ERROR_NOT_READY: return "CUDA not ready";
95
96                         case CUDA_ERROR_LAUNCH_FAILED: return "Launch failed";
97                         case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "Launch exceeded resources";
98                         case CUDA_ERROR_LAUNCH_TIMEOUT: return "Launch exceeded timeout";
99                         case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "Launch with incompatible texturing";
100
101                         case CUDA_ERROR_UNKNOWN: return "Unknown error";
102
103                         default: return "Unknown CUDA error value";
104                 }
105         }
106
107         static int cuda_align_up(int& offset, int alignment)
108         {
109                 return (offset + alignment - 1) & ~(alignment - 1);
110         }
111
/* cuda_abort() calls abort() in debug builds and expands to nothing when
 * NDEBUG is defined, so release builds keep running after a reported error. */
#ifndef NDEBUG
#define cuda_abort() abort()
#else
#define cuda_abort()
#endif

/* Evaluate a CUDA driver API call; on failure print the error message and
 * the text of the call to stderr, then invoke cuda_abort().
 *
 * The macro expands to a complete brace-enclosed block, which is why call
 * sites may omit the trailing semicolon (and must omit it when the call is
 * the body of an if that is followed by else). It calls cuda_error_string(),
 * so it is only usable where that function is in scope. */
#define cuda_assert(stmt) \
        { \
                CUresult cuda_status = stmt; \
                \
                if(cuda_status != CUDA_SUCCESS) { \
                        fprintf(stderr, "CUDA error: %s in %s\n", cuda_error_string(cuda_status), #stmt); \
                        cuda_abort(); \
                } \
        }
127
128         void cuda_push_context()
129         {
130                 cuda_assert(cuCtxSetCurrent(cuContext))
131         }
132
133         void cuda_pop_context()
134         {
135                 cuda_assert(cuCtxSetCurrent(NULL));
136         }
137
138         CUDADevice(bool background_)
139         {
140                 int major, minor;
141                 background = background_;
142
143                 cuDevId = 0;
144
145                 /* intialize */
146                 cuda_assert(cuInit(0))
147
148                 /* setup device and context */
149                 cuda_assert(cuDeviceGet(&cuDevice, cuDevId))
150
151                 if(background)
152                         cuda_assert(cuCtxCreate(&cuContext, 0, cuDevice))
153                 else
154                         cuda_assert(cuGLCtxCreate(&cuContext, 0, cuDevice))
155
156                 /* open module */
157                 cuDeviceComputeCapability(&major, &minor, cuDevId);
158                 string cubin = string_printf("lib/kernel_sm_%d%d.cubin", major, minor);
159                 cuda_assert(cuModuleLoad(&cuModule, path_get(cubin).c_str()))
160
161                 cuda_pop_context();
162         }
163
164         ~CUDADevice()
165         {
166                 cuda_push_context();
167                 cuda_assert(cuCtxDetach(cuContext))
168         }
169
170         string description()
171         {
172                 /* print device information */
173                 char deviceName[100];
174
175                 cuda_push_context();
176                 cuDeviceGetName(deviceName, 256, cuDevId);
177                 cuda_pop_context();
178
179                 return string("CUDA ") + deviceName;
180         }
181
182         void mem_alloc(device_memory& mem, MemoryType type)
183         {
184                 cuda_push_context();
185                 CUdeviceptr device_pointer;
186                 cuda_assert(cuMemAlloc(&device_pointer, mem.memory_size()))
187                 mem.device_pointer = (device_ptr)device_pointer;
188                 cuda_pop_context();
189         }
190
191         void mem_copy_to(device_memory& mem)
192         {
193                 cuda_push_context();
194                 cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size()))
195                 cuda_pop_context();
196         }
197
198         void mem_copy_from(device_memory& mem, size_t offset, size_t size)
199         {
200                 /* todo: offset is ignored */
201                 cuda_push_context();
202                 cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset,
203                         (CUdeviceptr)((uchar*)mem.device_pointer + offset), size))
204                 cuda_pop_context();
205         }
206
207         void mem_zero(device_memory& mem)
208         {
209                 memset((void*)mem.data_pointer, 0, mem.memory_size());
210
211                 cuda_push_context();
212                 cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()))
213                 cuda_pop_context();
214         }
215
216         void mem_free(device_memory& mem)
217         {
218                 if(mem.device_pointer) {
219                         cuda_push_context();
220                         cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer)))
221                         cuda_pop_context();
222
223                         mem.device_pointer = 0;
224                 }
225         }
226
227         void const_copy_to(const char *name, void *host, size_t size)
228         {
229                 CUdeviceptr mem;
230                 size_t bytes;
231
232                 cuda_push_context();
233                 cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name))
234                 assert(bytes == size);
235                 cuda_assert(cuMemcpyHtoD(mem, host, size))
236                 cuda_pop_context();
237         }
238
239         void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
240         {
241                 /* determine format */
242                 CUarray_format_enum format;
243                 size_t dsize = datatype_size(mem.data_type);
244                 size_t size = mem.memory_size();
245
246                 switch(mem.data_type) {
247                         case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
248                         case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
249                         case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
250                         case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
251                         default: assert(0); return;
252                 }
253
254                 CUtexref texref;
255
256                 cuda_push_context();
257                 cuda_assert(cuModuleGetTexRef(&texref, cuModule, name))
258
259                 if(interpolation) {
260                         CUarray handle;
261                         CUDA_ARRAY_DESCRIPTOR desc;
262
263                         desc.Width = mem.data_width;
264                         desc.Height = mem.data_height;
265                         desc.Format = format;
266                         desc.NumChannels = mem.data_elements;
267
268                         cuda_assert(cuArrayCreate(&handle, &desc))
269
270                         if(mem.data_height > 1) {
271                                 CUDA_MEMCPY2D param;
272                                 memset(&param, 0, sizeof(param));
273                                 param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
274                                 param.dstArray = handle;
275                                 param.srcMemoryType = CU_MEMORYTYPE_HOST;
276                                 param.srcHost = (void*)mem.data_pointer;
277                                 param.srcPitch = mem.data_width*dsize*mem.data_elements;
278                                 param.WidthInBytes = param.srcPitch;
279                                 param.Height = mem.data_height;
280
281                                 cuda_assert(cuMemcpy2D(&param))
282                         }
283                         else
284                                 cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size))
285
286                         cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT))
287
288                         cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR))
289                         cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES))
290
291                         mem.device_pointer = (device_ptr)handle;
292                 }
293                 else {
294                         cuda_pop_context();
295
296                         mem_alloc(mem, MEM_READ_ONLY);
297                         mem_copy_to(mem);
298
299                         cuda_push_context();
300
301                         cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size))
302                         cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT))
303                         cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER))
304                 }
305
306                 if(periodic) {
307                         cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_WRAP))
308                         cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_WRAP))
309                 }
310                 else {
311                         cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP))
312                         cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP))
313                 }
314                 cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements))
315
316                 cuda_pop_context();
317
318                 tex_interp_map[mem.device_pointer] = interpolation;
319         }
320
321         void tex_free(device_memory& mem)
322         {
323                 if(mem.device_pointer) {
324                         if(tex_interp_map[mem.device_pointer]) {
325                                 cuda_push_context();
326                                 cuArrayDestroy((CUarray)mem.device_pointer);
327                                 cuda_pop_context();
328
329                                 tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
330                                 mem.device_pointer = 0;
331                         }
332                         else {
333                                 tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
334                                 mem_free(mem);
335                         }
336                 }
337         }
338
339         void path_trace(DeviceTask& task)
340         {
341                 cuda_push_context();
342
343                 CUfunction cuPathTrace;
344                 CUdeviceptr d_buffer = cuda_device_ptr(task.buffer);
345                 CUdeviceptr d_rng_state = cuda_device_ptr(task.rng_state);
346
347                 /* get kernel function */
348                 cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"))
349                 
350                 /* pass in parameters */
351                 int offset = 0;
352                 
353                 cuda_assert(cuParamSetv(cuPathTrace, offset, &d_buffer, sizeof(d_buffer)))
354                 offset += sizeof(d_buffer);
355
356                 cuda_assert(cuParamSetv(cuPathTrace, offset, &d_rng_state, sizeof(d_rng_state)))
357                 offset += sizeof(d_rng_state);
358
359                 int pass = task.pass;
360                 offset = cuda_align_up(offset, __alignof(pass));
361
362                 cuda_assert(cuParamSeti(cuPathTrace, offset, task.pass))
363                 offset += sizeof(task.pass);
364
365                 cuda_assert(cuParamSeti(cuPathTrace, offset, task.x))
366                 offset += sizeof(task.x);
367
368                 cuda_assert(cuParamSeti(cuPathTrace, offset, task.y))
369                 offset += sizeof(task.y);
370
371                 cuda_assert(cuParamSeti(cuPathTrace, offset, task.w))
372                 offset += sizeof(task.w);
373
374                 cuda_assert(cuParamSeti(cuPathTrace, offset, task.h))
375                 offset += sizeof(task.h);
376
377                 cuda_assert(cuParamSetSize(cuPathTrace, offset))
378
379                 /* launch kernel: todo find optimal size, cache config for fermi */
380 #ifndef __APPLE__
381                 int xthreads = 16;
382                 int ythreads = 16;
383 #else
384                 int xthreads = 8;
385                 int ythreads = 8;
386 #endif
387                 int xblocks = (task.w + xthreads - 1)/xthreads;
388                 int yblocks = (task.h + ythreads - 1)/ythreads;
389
390                 cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1))
391                 cuda_assert(cuFuncSetBlockShape(cuPathTrace, xthreads, ythreads, 1))
392                 cuda_assert(cuLaunchGrid(cuPathTrace, xblocks, yblocks))
393
394                 cuda_pop_context();
395         }
396
397         void tonemap(DeviceTask& task)
398         {
399                 cuda_push_context();
400
401                 CUfunction cuFilmConvert;
402                 CUdeviceptr d_rgba = map_pixels(task.rgba);
403                 CUdeviceptr d_buffer = cuda_device_ptr(task.buffer);
404
405                 /* get kernel function */
406                 cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_tonemap"))
407
408                 /* pass in parameters */
409                 int offset = 0;
410
411                 cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_rgba, sizeof(d_rgba)))
412                 offset += sizeof(d_rgba);
413                 
414                 cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_buffer, sizeof(d_buffer)))
415                 offset += sizeof(d_buffer);
416
417                 int pass = task.pass;
418                 offset = cuda_align_up(offset, __alignof(pass));
419
420                 cuda_assert(cuParamSeti(cuFilmConvert, offset, task.pass))
421                 offset += sizeof(task.pass);
422
423                 cuda_assert(cuParamSeti(cuFilmConvert, offset, task.resolution))
424                 offset += sizeof(task.resolution);
425
426                 cuda_assert(cuParamSeti(cuFilmConvert, offset, task.x))
427                 offset += sizeof(task.x);
428
429                 cuda_assert(cuParamSeti(cuFilmConvert, offset, task.y))
430                 offset += sizeof(task.y);
431
432                 cuda_assert(cuParamSeti(cuFilmConvert, offset, task.w))
433                 offset += sizeof(task.w);
434
435                 cuda_assert(cuParamSeti(cuFilmConvert, offset, task.h))
436                 offset += sizeof(task.h);
437
438                 cuda_assert(cuParamSetSize(cuFilmConvert, offset))
439
440                 /* launch kernel: todo find optimal size, cache config for fermi */
441 #ifndef __APPLE__
442                 int xthreads = 16;
443                 int ythreads = 16;
444 #else
445                 int xthreads = 8;
446                 int ythreads = 8;
447 #endif
448                 int xblocks = (task.w + xthreads - 1)/xthreads;
449                 int yblocks = (task.h + ythreads - 1)/ythreads;
450
451                 cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1))
452                 cuda_assert(cuFuncSetBlockShape(cuFilmConvert, xthreads, ythreads, 1))
453                 cuda_assert(cuLaunchGrid(cuFilmConvert, xblocks, yblocks))
454
455                 unmap_pixels(task.rgba);
456
457                 cuda_pop_context();
458         }
459
460         void displace(DeviceTask& task)
461         {
462                 cuda_push_context();
463
464                 CUfunction cuDisplace;
465                 CUdeviceptr d_input = cuda_device_ptr(task.displace_input);
466                 CUdeviceptr d_offset = cuda_device_ptr(task.displace_offset);
467
468                 /* get kernel function */
469                 cuda_assert(cuModuleGetFunction(&cuDisplace, cuModule, "kernel_cuda_displace"))
470                 
471                 /* pass in parameters */
472                 int offset = 0;
473                 
474                 cuda_assert(cuParamSetv(cuDisplace, offset, &d_input, sizeof(d_input)))
475                 offset += sizeof(d_input);
476
477                 cuda_assert(cuParamSetv(cuDisplace, offset, &d_offset, sizeof(d_offset)))
478                 offset += sizeof(d_offset);
479
480                 int displace_x = task.displace_x;
481                 offset = cuda_align_up(offset, __alignof(displace_x));
482
483                 cuda_assert(cuParamSeti(cuDisplace, offset, task.displace_x))
484                 offset += sizeof(task.displace_x);
485
486                 cuda_assert(cuParamSetSize(cuDisplace, offset))
487
488                 /* launch kernel: todo find optimal size, cache config for fermi */
489 #ifndef __APPLE__
490                 int xthreads = 16;
491 #else
492                 int xthreads = 8;
493 #endif
494                 int xblocks = (task.displace_w + xthreads - 1)/xthreads;
495
496                 cuda_assert(cuFuncSetCacheConfig(cuDisplace, CU_FUNC_CACHE_PREFER_L1))
497                 cuda_assert(cuFuncSetBlockShape(cuDisplace, xthreads, 1, 1))
498                 cuda_assert(cuLaunchGrid(cuDisplace, xblocks, 1))
499
500                 cuda_pop_context();
501         }
502
503         CUdeviceptr map_pixels(device_ptr mem)
504         {
505                 if(!background) {
506                         PixelMem pmem = pixel_mem_map[mem];
507                         CUdeviceptr buffer;
508                         
509                         size_t bytes;
510                         cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0))
511                         cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource))
512                         
513                         return buffer;
514                 }
515
516                 return cuda_device_ptr(mem);
517         }
518
519         void unmap_pixels(device_ptr mem)
520         {
521                 if(!background) {
522                         PixelMem pmem = pixel_mem_map[mem];
523
524                         cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0))
525                 }
526         }
527
528         void pixels_alloc(device_memory& mem)
529         {
530                 if(!background) {
531                         PixelMem pmem;
532
533                         pmem.w = mem.data_width;
534                         pmem.h = mem.data_height;
535
536                         cuda_push_context();
537
538                         glGenBuffers(1, &pmem.cuPBO);
539                         glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
540                         glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(GLfloat)*3, NULL, GL_DYNAMIC_DRAW);
541                         
542                         glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
543                         
544                         glGenTextures(1, &pmem.cuTexId);
545                         glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
546                         glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
547                         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
548                         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
549                         glBindTexture(GL_TEXTURE_2D, 0);
550                         
551                         cuda_assert(cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE))
552
553                         cuda_pop_context();
554
555                         mem.device_pointer = pmem.cuTexId;
556                         pixel_mem_map[mem.device_pointer] = pmem;
557
558                         return;
559                 }
560
561                 Device::pixels_alloc(mem);
562         }
563
564         void pixels_copy_from(device_memory& mem, int y, int w, int h)
565         {
566                 if(!background) {
567                         PixelMem pmem = pixel_mem_map[mem.device_pointer];
568
569                         cuda_push_context();
570
571                         glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
572                         uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
573                         size_t offset = sizeof(uchar)*4*y*w;
574                         memcpy((uchar*)mem.data_pointer + offset, pixels + offset, sizeof(uchar)*4*w*h);
575                         glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
576                         glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
577
578                         cuda_pop_context();
579
580                         return;
581                 }
582
583                 Device::pixels_copy_from(mem, y, w, h);
584         }
585
586         void pixels_free(device_memory& mem)
587         {
588                 if(mem.device_pointer) {
589                         if(!background) {
590                                 PixelMem pmem = pixel_mem_map[mem.device_pointer];
591
592                                 cuda_push_context();
593
594                                 cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource))
595                                 glDeleteBuffers(1, &pmem.cuPBO);
596                                 glDeleteTextures(1, &pmem.cuTexId);
597
598                                 cuda_pop_context();
599
600                                 pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
601                                 mem.device_pointer = 0;
602
603                                 return;
604                         }
605
606                         Device::pixels_free(mem);
607                 }
608         }
609
610         void draw_pixels(device_memory& mem, int y, int w, int h, int width, int height)
611         {
612                 if(!background) {
613                         PixelMem pmem = pixel_mem_map[mem.device_pointer];
614
615                         cuda_push_context();
616
617                         glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pmem.cuPBO);
618                         glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
619                         glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, 0);
620                         glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
621                         
622                         glEnable(GL_TEXTURE_2D);
623                         
624                         glColor3f(1.0f, 1.0f, 1.0f);
625
626                         glPushMatrix();
627                         glTranslatef(0.0f, (float)y, 0.0f);
628                         
629                         glBegin(GL_QUADS);
630                         
631                         glTexCoord2f(0.0f, 0.0f);
632                         glVertex2f(0.0f, 0.0f);
633                         glTexCoord2f((float)w/(float)width, 0);
634                         glVertex2f((float)width, 0.0f);
635                         glTexCoord2f((float)w/(float)width, (float)h/(float)height);
636                         glVertex2f((float)width, (float)height);
637                         glTexCoord2f(0.0f, (float)h/(float)height);
638                         glVertex2f(0.0f, (float)height);
639
640                         glEnd();
641
642                         glPopMatrix();
643                         
644                         glBindTexture(GL_TEXTURE_2D, 0);
645                         glDisable(GL_TEXTURE_2D);
646
647                         cuda_pop_context();
648
649                         return;
650                 }
651
652                 Device::draw_pixels(mem, y, w, h, width, height);
653         }
654
655         void task_add(DeviceTask& task)
656         {
657                 if(task.type == DeviceTask::TONEMAP)
658                         tonemap(task);
659                 else if(task.type == DeviceTask::PATH_TRACE)
660                         path_trace(task);
661                 else if(task.type == DeviceTask::DISPLACE)
662                         displace(task);
663         }
664
665         void task_wait()
666         {
667                 cuda_push_context();
668
669                 cuda_assert(cuCtxSynchronize())
670
671                 cuda_pop_context();
672         }
673
674         void task_cancel()
675         {
676         }
677 };
678
679 Device *device_cuda_create(bool background)
680 {
681         return new CUDADevice(background);
682 }
683
684 CCL_NAMESPACE_END
685