2 * Copyright 2011, Blender Foundation.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version 2
7 * of the License, or (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software Foundation,
16 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
26 #include "device_intern.h"
29 #include "util_math.h"
31 #include "util_opencl.h"
32 #include "util_opengl.h"
33 #include "util_path.h"
34 #include "util_time.h"
38 #define CL_MEM_PTR(p) ((cl_mem)(unsigned long)(p))
40 class OpenCLDevice : public Device
44 cl_command_queue cqCommandQueue;
45 cl_platform_id cpPlatform;
46 cl_device_id cdDevice;
48 cl_kernel ckPathTraceKernel;
49 cl_kernel ckFilmConvertKernel;
51 map<string, device_vector<uchar>*> const_mem_map;
52 map<string, device_memory*> mem_map;
54 bool device_initialized;
56 const char *opencl_error_string(cl_int err)
59 case CL_SUCCESS: return "Success!";
60 case CL_DEVICE_NOT_FOUND: return "Device not found.";
61 case CL_DEVICE_NOT_AVAILABLE: return "Device not available";
62 case CL_COMPILER_NOT_AVAILABLE: return "Compiler not available";
63 case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "Memory object allocation failure";
64 case CL_OUT_OF_RESOURCES: return "Out of resources";
65 case CL_OUT_OF_HOST_MEMORY: return "Out of host memory";
66 case CL_PROFILING_INFO_NOT_AVAILABLE: return "Profiling information not available";
67 case CL_MEM_COPY_OVERLAP: return "Memory copy overlap";
68 case CL_IMAGE_FORMAT_MISMATCH: return "Image format mismatch";
69 case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "Image format not supported";
70 case CL_BUILD_PROGRAM_FAILURE: return "Program build failure";
71 case CL_MAP_FAILURE: return "Map failure";
72 case CL_INVALID_VALUE: return "Invalid value";
73 case CL_INVALID_DEVICE_TYPE: return "Invalid device type";
74 case CL_INVALID_PLATFORM: return "Invalid platform";
75 case CL_INVALID_DEVICE: return "Invalid device";
76 case CL_INVALID_CONTEXT: return "Invalid context";
77 case CL_INVALID_QUEUE_PROPERTIES: return "Invalid queue properties";
78 case CL_INVALID_COMMAND_QUEUE: return "Invalid command queue";
79 case CL_INVALID_HOST_PTR: return "Invalid host pointer";
80 case CL_INVALID_MEM_OBJECT: return "Invalid memory object";
81 case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "Invalid image format descriptor";
82 case CL_INVALID_IMAGE_SIZE: return "Invalid image size";
83 case CL_INVALID_SAMPLER: return "Invalid sampler";
84 case CL_INVALID_BINARY: return "Invalid binary";
85 case CL_INVALID_BUILD_OPTIONS: return "Invalid build options";
86 case CL_INVALID_PROGRAM: return "Invalid program";
87 case CL_INVALID_PROGRAM_EXECUTABLE: return "Invalid program executable";
88 case CL_INVALID_KERNEL_NAME: return "Invalid kernel name";
89 case CL_INVALID_KERNEL_DEFINITION: return "Invalid kernel definition";
90 case CL_INVALID_KERNEL: return "Invalid kernel";
91 case CL_INVALID_ARG_INDEX: return "Invalid argument index";
92 case CL_INVALID_ARG_VALUE: return "Invalid argument value";
93 case CL_INVALID_ARG_SIZE: return "Invalid argument size";
94 case CL_INVALID_KERNEL_ARGS: return "Invalid kernel arguments";
95 case CL_INVALID_WORK_DIMENSION: return "Invalid work dimension";
96 case CL_INVALID_WORK_GROUP_SIZE: return "Invalid work group size";
97 case CL_INVALID_WORK_ITEM_SIZE: return "Invalid work item size";
98 case CL_INVALID_GLOBAL_OFFSET: return "Invalid global offset";
99 case CL_INVALID_EVENT_WAIT_LIST: return "Invalid event wait list";
100 case CL_INVALID_EVENT: return "Invalid event";
101 case CL_INVALID_OPERATION: return "Invalid operation";
102 case CL_INVALID_GL_OBJECT: return "Invalid OpenGL object";
103 case CL_INVALID_BUFFER_SIZE: return "Invalid buffer size";
104 case CL_INVALID_MIP_LEVEL: return "Invalid mip-map level";
105 default: return "Unknown";
109 bool opencl_error(cl_int err)
111 if(err != CL_SUCCESS) {
112 fprintf(stderr, "OpenCL error (%d): %s\n", err, opencl_error_string(err));
119 void opencl_assert(cl_int err)
121 if(err != CL_SUCCESS) {
122 fprintf(stderr, "OpenCL error (%d): %s\n", err, opencl_error_string(err));
129 OpenCLDevice(bool background_)
131 background = background_;
134 cqCommandQueue = NULL;
136 ckPathTraceKernel = NULL;
137 ckFilmConvertKernel = NULL;
139 device_initialized = false;
141 vector<cl_platform_id> platform_ids;
142 cl_uint num_platforms;
145 ciErr = clGetPlatformIDs(0, NULL, &num_platforms);
146 if(opencl_error(ciErr))
149 if(num_platforms == 0) {
150 fprintf(stderr, "OpenCL: no platforms found.\n");
154 platform_ids.resize(num_platforms);
155 ciErr = clGetPlatformIDs(num_platforms, &platform_ids[0], NULL);
156 if(opencl_error(ciErr))
159 cpPlatform = platform_ids[0]; /* todo: pick specified platform && device */
161 ciErr = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU|CL_DEVICE_TYPE_ACCELERATOR, 1, &cdDevice, NULL);
162 if(opencl_error(ciErr))
165 cxContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr);
166 if(opencl_error(ciErr))
169 cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
170 if(opencl_error(ciErr))
173 null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
174 device_initialized = true;
177 bool opencl_version_check()
180 int major, minor, req_major = 1, req_minor = 1;
182 clGetPlatformInfo(cpPlatform, CL_PLATFORM_VERSION, sizeof(version), &version, NULL);
184 if(sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) {
185 fprintf(stderr, "OpenCL: failed to parse platform version string (%s).", version);
189 if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
190 fprintf(stderr, "OpenCL: platform version 1.1 or later required, found %d.%d\n", major, minor);
194 clGetDeviceInfo(cdDevice, CL_DEVICE_OPENCL_C_VERSION, sizeof(version), &version, NULL);
196 if(sscanf(version, "OpenCL C %d.%d", &major, &minor) < 2) {
197 fprintf(stderr, "OpenCL: failed to parse OpenCL C version string (%s).", version);
201 if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
202 fprintf(stderr, "OpenCL: C version 1.1 or later required, found %d.%d\n", major, minor);
206 /* we don't check CL_DEVICE_VERSION since for e.g. nvidia sm 1.3 cards this is
207 1.0 even if the language features are there, just limited shared memory */
212 bool load_binary(const string& kernel_path, const string& clbin)
214 /* read binary into memory */
215 vector<uint8_t> binary;
217 if(!path_read_binary(clbin, binary)) {
218 fprintf(stderr, "OpenCL failed to read cached binary %s.\n", clbin.c_str());
224 size_t size = binary.size();
225 const uint8_t *bytes = &binary[0];
227 cpProgram = clCreateProgramWithBinary(cxContext, 1, &cdDevice,
228 &size, &bytes, &status, &ciErr);
230 if(opencl_error(status) || opencl_error(ciErr)) {
231 fprintf(stderr, "OpenCL failed create program from cached binary %s.\n", clbin.c_str());
235 if(!build_kernel(kernel_path))
241 bool save_binary(const string& clbin)
244 clGetProgramInfo(cpProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
249 vector<uint8_t> binary(size);
250 uint8_t *bytes = &binary[0];
252 clGetProgramInfo(cpProgram, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL);
254 if(!path_write_binary(clbin, binary)) {
255 fprintf(stderr, "OpenCL failed to write cached binary %s.\n", clbin.c_str());
262 bool build_kernel(const string& kernel_path)
264 string build_options = "";
266 build_options += "-I " + kernel_path + ""; /* todo: escape path */
267 build_options += " -cl-fast-relaxed-math -cl-strict-aliasing";
269 ciErr = clBuildProgram(cpProgram, 0, NULL, build_options.c_str(), NULL, NULL);
271 if(ciErr != CL_SUCCESS) {
272 /* show build errors */
276 clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
278 build_log = new char[ret_val_size+1];
279 clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
281 build_log[ret_val_size] = '\0';
282 fprintf(stderr, "OpenCL build failed:\n %s\n", build_log);
292 bool compile_kernel(const string& kernel_path, const string& kernel_md5)
294 /* we compile kernels consisting of many files. unfortunately opencl
295 kernel caches do not seem to recognize changes in included files.
296 so we force recompile on changes by adding the md5 hash of all files */
297 string source = "#include \"kernel.cl\" // " + kernel_md5 + "\n";
298 size_t source_len = source.size();
299 const char *source_str = source.c_str();
301 cpProgram = clCreateProgramWithSource(cxContext, 1, &source_str, &source_len, &ciErr);
303 if(opencl_error(ciErr))
306 double starttime = time_dt();
307 printf("Compiling OpenCL kernel ...\n");
309 if(!build_kernel(kernel_path))
312 printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
317 string device_md5_hash()
320 char version[256], driver[256], name[256], vendor[256];
322 clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
323 clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
324 clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
325 clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
327 md5.append((uint8_t*)vendor, strlen(vendor));
328 md5.append((uint8_t*)version, strlen(version));
329 md5.append((uint8_t*)name, strlen(name));
330 md5.append((uint8_t*)driver, strlen(driver));
332 return md5.get_hex();
337 /* verify if device was initialized */
338 if(!device_initialized) {
339 fprintf(stderr, "OpenCL: failed to initialize device.\n");
343 /* verify we have right opencl version */
344 if(!opencl_version_check())
347 /* md5 hash to detect changes */
348 string kernel_path = path_get("kernel");
349 string kernel_md5 = path_files_md5_hash(kernel_path);
350 string device_md5 = device_md5_hash();
352 /* try to use cache binary */
353 string clbin = string_printf("cycles_kernel_%s_%s.clbin", device_md5.c_str(), kernel_md5.c_str());;
354 clbin = path_user_get(path_join("cache", clbin));
356 if(path_exists(clbin)) {
357 /* if exists already, try use it */
358 if(!load_binary(kernel_path, clbin))
363 if(!compile_kernel(kernel_path, kernel_md5))
366 /* save binary for reuse */
371 ckPathTraceKernel = clCreateKernel(cpProgram, "kernel_ocl_path_trace", &ciErr);
372 if(opencl_error(ciErr))
375 ckFilmConvertKernel = clCreateKernel(cpProgram, "kernel_ocl_tonemap", &ciErr);
376 if(opencl_error(ciErr))
385 clReleaseMemObject(CL_MEM_PTR(null_mem));
387 map<string, device_vector<uchar>*>::iterator mt;
388 for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
389 mem_free(*(mt->second));
393 if(ckPathTraceKernel)
394 clReleaseKernel(ckPathTraceKernel);
395 if(ckFilmConvertKernel)
396 clReleaseKernel(ckFilmConvertKernel);
398 clReleaseProgram(cpProgram);
400 clReleaseCommandQueue(cqCommandQueue);
402 clReleaseContext(cxContext);
409 clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
411 return string("OpenCL ") + name;
414 void mem_alloc(device_memory& mem, MemoryType type)
416 size_t size = mem.memory_size();
418 if(type == MEM_READ_ONLY)
419 mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, size, NULL, &ciErr);
420 else if(type == MEM_WRITE_ONLY)
421 mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, CL_MEM_WRITE_ONLY, size, NULL, &ciErr);
423 mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_WRITE, size, NULL, &ciErr);
425 opencl_assert(ciErr);
428 void mem_copy_to(device_memory& mem)
430 /* this is blocking */
431 size_t size = mem.memory_size();
432 ciErr = clEnqueueWriteBuffer(cqCommandQueue, CL_MEM_PTR(mem.device_pointer), CL_TRUE, 0, size, (void*)mem.data_pointer, 0, NULL, NULL);
433 opencl_assert(ciErr);
436 void mem_copy_from(device_memory& mem, size_t offset, size_t size)
438 ciErr = clEnqueueReadBuffer(cqCommandQueue, CL_MEM_PTR(mem.device_pointer), CL_TRUE, offset, size, (uchar*)mem.data_pointer + offset, 0, NULL, NULL);
439 opencl_assert(ciErr);
442 void mem_zero(device_memory& mem)
444 if(mem.device_pointer) {
445 memset((void*)mem.data_pointer, 0, mem.memory_size());
450 void mem_free(device_memory& mem)
452 if(mem.device_pointer) {
453 ciErr = clReleaseMemObject(CL_MEM_PTR(mem.device_pointer));
454 mem.device_pointer = 0;
455 opencl_assert(ciErr);
459 void const_copy_to(const char *name, void *host, size_t size)
461 if(const_mem_map.find(name) == const_mem_map.end()) {
462 device_vector<uchar> *data = new device_vector<uchar>();
463 data->copy((uchar*)host, size);
465 mem_alloc(*data, MEM_READ_ONLY);
466 const_mem_map[name] = data;
469 device_vector<uchar> *data = const_mem_map[name];
470 data->copy((uchar*)host, size);
473 mem_copy_to(*const_mem_map[name]);
476 void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
478 mem_alloc(mem, MEM_READ_ONLY);
480 mem_map[name] = &mem;
483 void tex_free(device_memory& mem)
489 size_t global_size_round_up(int group_size, int global_size)
491 int r = global_size % group_size;
492 return global_size + ((r == 0)? 0: group_size - r);
495 void path_trace(DeviceTask& task)
497 /* cast arguments to cl types */
498 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
499 cl_mem d_buffer = CL_MEM_PTR(task.buffer);
500 cl_mem d_rng_state = CL_MEM_PTR(task.rng_state);
505 cl_int d_sample = task.sample;
507 /* sample arguments */
511 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_data), (void*)&d_data);
512 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_buffer), (void*)&d_buffer);
513 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_rng_state), (void*)&d_rng_state);
515 #define KERNEL_TEX(type, ttype, name) \
516 ciErr |= set_kernel_arg_mem(ckPathTraceKernel, &narg, #name);
517 #include "kernel_textures.h"
519 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_sample), (void*)&d_sample);
520 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_x), (void*)&d_x);
521 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_y), (void*)&d_y);
522 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_w), (void*)&d_w);
523 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_h), (void*)&d_h);
525 opencl_assert(ciErr);
527 size_t workgroup_size;
529 clGetKernelWorkGroupInfo(ckPathTraceKernel, cdDevice,
530 CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
532 workgroup_size = max(sqrt((double)workgroup_size), 1.0);
534 size_t local_size[2] = {workgroup_size, workgroup_size};
535 size_t global_size[2] = {global_size_round_up(local_size[0], d_w), global_size_round_up(local_size[1], d_h)};
538 ciErr = clEnqueueNDRangeKernel(cqCommandQueue, ckPathTraceKernel, 2, NULL, global_size, local_size, 0, NULL, NULL);
539 opencl_assert(ciErr);
540 opencl_assert(clFinish(cqCommandQueue));
543 cl_int set_kernel_arg_mem(cl_kernel kernel, int *narg, const char *name)
546 cl_int size, err = 0;
548 if(mem_map.find(name) != mem_map.end()) {
549 device_memory *mem = mem_map[name];
551 ptr = CL_MEM_PTR(mem->device_pointer);
552 size = mem->data_width;
555 /* work around NULL not working, even though the spec says otherwise */
556 ptr = CL_MEM_PTR(null_mem);
560 err |= clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr);
562 err |= clSetKernelArg(kernel, (*narg)++, sizeof(size), (void*)&size);
568 void tonemap(DeviceTask& task)
570 /* cast arguments to cl types */
571 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
572 cl_mem d_rgba = CL_MEM_PTR(task.rgba);
573 cl_mem d_buffer = CL_MEM_PTR(task.buffer);
578 cl_int d_sample = task.sample;
579 cl_int d_resolution = task.resolution;
581 /* sample arguments */
585 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_data), (void*)&d_data);
586 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_rgba), (void*)&d_rgba);
587 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_buffer), (void*)&d_buffer);
589 #define KERNEL_TEX(type, ttype, name) \
590 ciErr |= set_kernel_arg_mem(ckFilmConvertKernel, &narg, #name);
591 #include "kernel_textures.h"
593 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_sample), (void*)&d_sample);
594 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_resolution), (void*)&d_resolution);
595 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_x), (void*)&d_x);
596 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_y), (void*)&d_y);
597 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_w), (void*)&d_w);
598 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_h), (void*)&d_h);
600 opencl_assert(ciErr);
602 size_t workgroup_size;
604 clGetKernelWorkGroupInfo(ckFilmConvertKernel, cdDevice,
605 CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
607 workgroup_size = max(sqrt((double)workgroup_size), 1.0);
609 size_t local_size[2] = {workgroup_size, workgroup_size};
610 size_t global_size[2] = {global_size_round_up(local_size[0], d_w), global_size_round_up(local_size[1], d_h)};
613 ciErr = clEnqueueNDRangeKernel(cqCommandQueue, ckFilmConvertKernel, 2, NULL, global_size, local_size, 0, NULL, NULL);
614 opencl_assert(ciErr);
615 opencl_assert(clFinish(cqCommandQueue));
618 void task_add(DeviceTask& task)
620 if(task.type == DeviceTask::TONEMAP)
622 else if(task.type == DeviceTask::PATH_TRACE)
635 Device *device_opencl_create(bool background)
637 return new OpenCLDevice(background);
642 #endif /* WITH_OPENCL */