9b7631674594ca23d9a09d520e8867b9369b6bd2
[blender.git] / intern / cycles / device / opencl / opencl.h
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifdef WITH_OPENCL
18
19 #include "device/device.h"
20 #include "device/device_denoising.h"
21
22 #include "util/util_map.h"
23 #include "util/util_param.h"
24 #include "util/util_string.h"
25
26 #include "clew.h"
27
28 #include "device/opencl/memory_manager.h"
29
30 CCL_NAMESPACE_BEGIN
31
32 /* Disable workarounds, seems to be working fine on latest drivers. */
33 #define CYCLES_DISABLE_DRIVER_WORKAROUNDS
34
35 /* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workaounds for testing */
36 #ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS
37 /* Work around AMD driver hangs by ensuring each command is finished before doing anything else. */
38 #  undef clEnqueueNDRangeKernel
39 #  define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \
40         CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \
41         clFinish(a);
42
43 #  undef clEnqueueWriteBuffer
44 #  define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \
45         CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \
46         clFinish(a);
47
48 #  undef clEnqueueReadBuffer
49 #  define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \
50         CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \
51         clFinish(a);
52 #endif  /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */
53
54 #define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
55
56 struct OpenCLPlatformDevice {
57         OpenCLPlatformDevice(cl_platform_id platform_id,
58                              const string& platform_name,
59                              cl_device_id device_id,
60                              cl_device_type device_type,
61                              const string& device_name,
62                              const string& hardware_id,
63                                  const string& device_extensions)
64           : platform_id(platform_id),
65             platform_name(platform_name),
66             device_id(device_id),
67             device_type(device_type),
68             device_name(device_name),
69             hardware_id(hardware_id),
70             device_extensions(device_extensions) {}
71         cl_platform_id platform_id;
72         string platform_name;
73         cl_device_id device_id;
74         cl_device_type device_type;
75         string device_name;
76         string hardware_id;
77         string device_extensions;
78 };
79
80 /* Contains all static OpenCL helper functions. */
81 class OpenCLInfo
82 {
83 public:
84         static cl_device_type device_type();
85         static bool use_debug();
86         static bool kernel_use_advanced_shading(const string& platform_name);
87         static bool kernel_use_split(const string& platform_name,
88                                      const cl_device_type device_type);
89         static bool device_supported(const string& platform_name,
90                                      const cl_device_id device_id);
91         static bool platform_version_check(cl_platform_id platform,
92                                            string *error = NULL);
93         static bool device_version_check(cl_device_id device,
94                                          string *error = NULL);
95         static string get_hardware_id(const string& platform_name,
96                                       cl_device_id device_id);
97         static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices,
98                                        bool force_all = false);
99         static bool use_single_program();
100
101         /* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */
102
103         /* Platform information. */
104         static bool get_num_platforms(cl_uint *num_platforms, cl_int *error = NULL);
105         static cl_uint get_num_platforms();
106
107         static bool get_platforms(vector<cl_platform_id> *platform_ids,
108                                   cl_int *error = NULL);
109         static vector<cl_platform_id> get_platforms();
110
111         static bool get_platform_name(cl_platform_id platform_id,
112                                       string *platform_name);
113         static string get_platform_name(cl_platform_id platform_id);
114
115         static bool get_num_platform_devices(cl_platform_id platform_id,
116                                              cl_device_type device_type,
117                                              cl_uint *num_devices,
118                                              cl_int *error = NULL);
119         static cl_uint get_num_platform_devices(cl_platform_id platform_id,
120                                                 cl_device_type device_type);
121
122         static bool get_platform_devices(cl_platform_id platform_id,
123                                          cl_device_type device_type,
124                                          vector<cl_device_id> *device_ids,
125                                          cl_int* error = NULL);
126         static vector<cl_device_id> get_platform_devices(cl_platform_id platform_id,
127                                                          cl_device_type device_type);
128
129         /* Device information. */
130         static bool get_device_name(cl_device_id device_id,
131                                     string *device_name,
132                                     cl_int* error = NULL);
133
134         static string get_device_name(cl_device_id device_id);
135
136         static bool get_device_extensions(cl_device_id device_id,
137                                           string *device_extensions,
138                                           cl_int* error = NULL);
139
140         static string get_device_extensions(cl_device_id device_id);
141
142         static bool get_device_type(cl_device_id device_id,
143                                     cl_device_type *device_type,
144                                     cl_int* error = NULL);
145         static cl_device_type get_device_type(cl_device_id device_id);
146
147         static bool get_driver_version(cl_device_id device_id,
148                                        int *major,
149                                        int *minor,
150                                        cl_int* error = NULL);
151
152         static int mem_sub_ptr_alignment(cl_device_id device_id);
153
154         /* Get somewhat more readable device name.
155          * Main difference is AMD OpenCL here which only gives code name
156          * for the regular device name. This will give more sane device
157          * name using some extensions.
158          */
159         static string get_readable_device_name(cl_device_id device_id);
160 };
161
162 /* Thread safe cache for contexts and programs.
163  */
164 class OpenCLCache
165 {
166         struct Slot
167         {
168                 struct ProgramEntry
169                 {
170                         ProgramEntry();
171                         ProgramEntry(const ProgramEntry& rhs);
172                         ~ProgramEntry();
173                         cl_program program;
174                         thread_mutex *mutex;
175                 };
176
177                 Slot();
178                 Slot(const Slot& rhs);
179                 ~Slot();
180
181                 thread_mutex *context_mutex;
182                 cl_context context;
183                 typedef map<ustring, ProgramEntry> EntryMap;
184                 EntryMap programs;
185
186         };
187
188         /* key is combination of platform ID and device ID */
189         typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair;
190
191         /* map of Slot objects */
192         typedef map<PlatformDevicePair, Slot> CacheMap;
193         CacheMap cache;
194
195         /* MD5 hash of the kernel source. */
196         string kernel_md5;
197
198         thread_mutex cache_lock;
199         thread_mutex kernel_md5_lock;
200
201         /* lazy instantiate */
202         static OpenCLCache& global_instance();
203
204 public:
205
206         enum ProgramName {
207                 OCL_DEV_BASE_PROGRAM,
208                 OCL_DEV_MEGAKERNEL_PROGRAM,
209         };
210
211         /* Lookup context in the cache. If this returns NULL, slot_locker
212          * will be holding a lock for the cache. slot_locker should refer to a
213          * default constructed thread_scoped_lock. */
214         static cl_context get_context(cl_platform_id platform,
215                                       cl_device_id device,
216                                       thread_scoped_lock& slot_locker);
217         /* Same as above. */
218         static cl_program get_program(cl_platform_id platform,
219                                       cl_device_id device,
220                                       ustring key,
221                                       thread_scoped_lock& slot_locker);
222
223         /* Store context in the cache. You MUST have tried to get the item before storing to it. */
224         static void store_context(cl_platform_id platform,
225                                   cl_device_id device,
226                                   cl_context context,
227                                   thread_scoped_lock& slot_locker);
228         /* Same as above. */
229         static void store_program(cl_platform_id platform,
230                                   cl_device_id device,
231                                   cl_program program,
232                                   ustring key,
233                                   thread_scoped_lock& slot_locker);
234
235         static string get_kernel_md5();
236 };
237
238 #define opencl_device_assert(device, stmt) \
239         { \
240                 cl_int err = stmt; \
241                 \
242                 if(err != CL_SUCCESS) { \
243                         string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
244                         if((device)->error_message() == "") \
245                                 (device)->set_error(message); \
246                         fprintf(stderr, "%s\n", message.c_str()); \
247                 } \
248         } (void) 0
249
250 #define opencl_assert(stmt) \
251         { \
252                 cl_int err = stmt; \
253                 \
254                 if(err != CL_SUCCESS) { \
255                         string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
256                         if(error_msg == "") \
257                                 error_msg = message; \
258                         fprintf(stderr, "%s\n", message.c_str()); \
259                 } \
260         } (void) 0
261
262 class OpenCLDeviceBase : public Device
263 {
264 public:
265         DedicatedTaskPool task_pool;
266         cl_context cxContext;
267         cl_command_queue cqCommandQueue;
268         cl_platform_id cpPlatform;
269         cl_device_id cdDevice;
270         cl_int ciErr;
271
272         class OpenCLProgram {
273         public:
274                 OpenCLProgram() : loaded(false), program(NULL), device(NULL) {}
275                 OpenCLProgram(OpenCLDeviceBase *device,
276                               const string& program_name,
277                               const string& kernel_name,
278                               const string& kernel_build_options,
279                               bool use_stdout = true);
280                 ~OpenCLProgram();
281
282                 void add_kernel(ustring name);
283                 void load();
284
285                 bool is_loaded() const { return loaded; }
286                 const string& get_log() const { return log; }
287                 void report_error();
288
289                 cl_kernel operator()();
290                 cl_kernel operator()(ustring name);
291
292                 void release();
293
294         private:
295                 bool build_kernel(const string *debug_src);
296                 bool compile_kernel(const string *debug_src);
297                 bool load_binary(const string& clbin, const string *debug_src = NULL);
298                 bool save_binary(const string& clbin);
299
300                 void add_log(const string& msg, bool is_debug);
301                 void add_error(const string& msg);
302
303                 bool loaded;
304                 cl_program program;
305                 OpenCLDeviceBase *device;
306
307                 /* Used for the OpenCLCache key. */
308                 string program_name;
309
310                 string kernel_file, kernel_build_options, device_md5;
311
312                 bool use_stdout;
313                 string log, error_msg;
314                 string compile_output;
315
316                 map<ustring, cl_kernel> kernels;
317         };
318
319         OpenCLProgram base_program, denoising_program;
320
321         typedef map<string, device_vector<uchar>*> ConstMemMap;
322         typedef map<string, device_ptr> MemMap;
323
324         ConstMemMap const_mem_map;
325         MemMap mem_map;
326         device_ptr null_mem;
327
328         bool device_initialized;
329         string platform_name;
330         string device_name;
331
332         bool opencl_error(cl_int err);
333         void opencl_error(const string& message);
334         void opencl_assert_err(cl_int err, const char* where);
335
336         OpenCLDeviceBase(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background_);
337         ~OpenCLDeviceBase();
338
339         static void CL_CALLBACK context_notify_callback(const char *err_info,
340                 const void * /*private_info*/, size_t /*cb*/, void *user_data);
341
342         bool opencl_version_check();
343
344         string device_md5_hash(string kernel_custom_build_options = "");
345         bool load_kernels(const DeviceRequestedFeatures& requested_features);
346
347         /* Has to be implemented by the real device classes.
348          * The base device will then load all these programs. */
349         virtual bool load_kernels(const DeviceRequestedFeatures& requested_features,
350                                   vector<OpenCLProgram*> &programs) = 0;
351
352         void mem_alloc(device_memory& mem);
353         void mem_copy_to(device_memory& mem);
354         void mem_copy_from(device_memory& mem, int y, int w, int h, int elem);
355         void mem_zero(device_memory& mem);
356         void mem_free(device_memory& mem);
357
358         int mem_sub_ptr_alignment();
359
360         void const_copy_to(const char *name, void *host, size_t size);
361         void tex_alloc(device_memory& mem);
362         void tex_free(device_memory& mem);
363
364         size_t global_size_round_up(int group_size, int global_size);
365         void enqueue_kernel(cl_kernel kernel, size_t w, size_t h,
366                             bool x_workgroups = false,
367                             size_t max_workgroup_size = -1);
368         void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);
369         void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg);
370
371         void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half);
372         void shader(DeviceTask& task);
373
374         void denoise(RenderTile& tile, DenoisingTask& denoising);
375
376         class OpenCLDeviceTask : public DeviceTask {
377         public:
378                 OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task)
379                 : DeviceTask(task)
380                 {
381                         run = function_bind(&OpenCLDeviceBase::thread_run,
382                                             device,
383                                             this);
384                 }
385         };
386
387         int get_split_task_count(DeviceTask& /*task*/)
388         {
389                 return 1;
390         }
391
392         void task_add(DeviceTask& task)
393         {
394                 task_pool.push(new OpenCLDeviceTask(this, task));
395         }
396
397         void task_wait()
398         {
399                 task_pool.wait();
400         }
401
402         void task_cancel()
403         {
404                 task_pool.cancel();
405         }
406
407         virtual void thread_run(DeviceTask * /*task*/) = 0;
408
409         virtual bool is_split_kernel() = 0;
410
411 protected:
412         string kernel_build_options(const string *debug_src = NULL);
413
414         void mem_zero_kernel(device_ptr ptr, size_t size);
415
416         bool denoising_non_local_means(device_ptr image_ptr,
417                                        device_ptr guide_ptr,
418                                        device_ptr variance_ptr,
419                                        device_ptr out_ptr,
420                                        DenoisingTask *task);
421         bool denoising_construct_transform(DenoisingTask *task);
422         bool denoising_accumulate(device_ptr color_ptr,
423                                   device_ptr color_variance_ptr,
424                                   device_ptr scale_ptr,
425                                   int frame,
426                                   DenoisingTask *task);
427         bool denoising_solve(device_ptr output_ptr,
428                              DenoisingTask *task);
429         bool denoising_combine_halves(device_ptr a_ptr,
430                                       device_ptr b_ptr,
431                                       device_ptr mean_ptr,
432                                       device_ptr variance_ptr,
433                                       int r, int4 rect,
434                                       DenoisingTask *task);
435         bool denoising_divide_shadow(device_ptr a_ptr,
436                                      device_ptr b_ptr,
437                                      device_ptr sample_variance_ptr,
438                                      device_ptr sv_variance_ptr,
439                                      device_ptr buffer_variance_ptr,
440                                      DenoisingTask *task);
441         bool denoising_get_feature(int mean_offset,
442                                    int variance_offset,
443                                    device_ptr mean_ptr,
444                                    device_ptr variance_ptr,
445                                    float scale,
446                                    DenoisingTask *task);
447         bool denoising_write_feature(int to_offset,
448                                      device_ptr from_ptr,
449                                      device_ptr buffer_ptr,
450                                      DenoisingTask *task);
451         bool denoising_detect_outliers(device_ptr image_ptr,
452                                        device_ptr variance_ptr,
453                                        device_ptr depth_ptr,
454                                        device_ptr output_ptr,
455                                        DenoisingTask *task);
456
457         device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int size);
458         void mem_free_sub_ptr(device_ptr ptr);
459
460         class ArgumentWrapper {
461         public:
462                 ArgumentWrapper() : size(0), pointer(NULL)
463                 {
464                 }
465
466                 ArgumentWrapper(device_memory& argument) : size(sizeof(void*)),
467                                                            pointer((void*)(&argument.device_pointer))
468                 {
469                 }
470
471                 template<typename T>
472                 ArgumentWrapper(device_vector<T>& argument) : size(sizeof(void*)),
473                                                               pointer((void*)(&argument.device_pointer))
474                 {
475                 }
476
477                 template<typename T>
478                 ArgumentWrapper(device_only_memory<T>& argument) : size(sizeof(void*)),
479                                                                    pointer((void*)(&argument.device_pointer))
480                 {
481                 }
482                 template<typename T>
483                 ArgumentWrapper(T& argument) : size(sizeof(argument)),
484                                                pointer(&argument)
485                 {
486                 }
487
488                 ArgumentWrapper(int argument) : size(sizeof(int)),
489                                                 int_value(argument),
490                                                 pointer(&int_value)
491                 {
492                 }
493
494                 ArgumentWrapper(float argument) : size(sizeof(float)),
495                                                   float_value(argument),
496                                                   pointer(&float_value)
497                 {
498                 }
499
500                 size_t size;
501                 int int_value;
502                 float float_value;
503                 void *pointer;
504         };
505
506         /* TODO(sergey): In the future we can use variadic templates, once
507          * C++0x is allowed. Should allow to clean this up a bit.
508          */
509         int kernel_set_args(cl_kernel kernel,
510                             int start_argument_index,
511                             const ArgumentWrapper& arg1 = ArgumentWrapper(),
512                             const ArgumentWrapper& arg2 = ArgumentWrapper(),
513                             const ArgumentWrapper& arg3 = ArgumentWrapper(),
514                             const ArgumentWrapper& arg4 = ArgumentWrapper(),
515                             const ArgumentWrapper& arg5 = ArgumentWrapper(),
516                             const ArgumentWrapper& arg6 = ArgumentWrapper(),
517                             const ArgumentWrapper& arg7 = ArgumentWrapper(),
518                             const ArgumentWrapper& arg8 = ArgumentWrapper(),
519                             const ArgumentWrapper& arg9 = ArgumentWrapper(),
520                             const ArgumentWrapper& arg10 = ArgumentWrapper(),
521                             const ArgumentWrapper& arg11 = ArgumentWrapper(),
522                             const ArgumentWrapper& arg12 = ArgumentWrapper(),
523                             const ArgumentWrapper& arg13 = ArgumentWrapper(),
524                             const ArgumentWrapper& arg14 = ArgumentWrapper(),
525                             const ArgumentWrapper& arg15 = ArgumentWrapper(),
526                             const ArgumentWrapper& arg16 = ArgumentWrapper(),
527                             const ArgumentWrapper& arg17 = ArgumentWrapper(),
528                             const ArgumentWrapper& arg18 = ArgumentWrapper(),
529                             const ArgumentWrapper& arg19 = ArgumentWrapper(),
530                             const ArgumentWrapper& arg20 = ArgumentWrapper(),
531                             const ArgumentWrapper& arg21 = ArgumentWrapper(),
532                             const ArgumentWrapper& arg22 = ArgumentWrapper(),
533                             const ArgumentWrapper& arg23 = ArgumentWrapper(),
534                             const ArgumentWrapper& arg24 = ArgumentWrapper(),
535                             const ArgumentWrapper& arg25 = ArgumentWrapper(),
536                             const ArgumentWrapper& arg26 = ArgumentWrapper(),
537                             const ArgumentWrapper& arg27 = ArgumentWrapper(),
538                             const ArgumentWrapper& arg28 = ArgumentWrapper(),
539                             const ArgumentWrapper& arg29 = ArgumentWrapper(),
540                             const ArgumentWrapper& arg30 = ArgumentWrapper(),
541                             const ArgumentWrapper& arg31 = ArgumentWrapper(),
542                             const ArgumentWrapper& arg32 = ArgumentWrapper(),
543                             const ArgumentWrapper& arg33 = ArgumentWrapper());
544
545         void release_kernel_safe(cl_kernel kernel);
546         void release_mem_object_safe(cl_mem mem);
547         void release_program_safe(cl_program program);
548
549         /* ** Those guys are for workign around some compiler-specific bugs ** */
550
551         virtual cl_program load_cached_kernel(
552                 ustring key,
553                 thread_scoped_lock& cache_locker);
554
555         virtual void store_cached_kernel(
556                 cl_program program,
557                 ustring key,
558                 thread_scoped_lock& cache_locker);
559
560         virtual string build_options_for_base_program(
561                 const DeviceRequestedFeatures& /*requested_features*/);
562
563 private:
564         MemoryManager memory_manager;
565         friend class MemoryManager;
566
567         static_assert_align(TextureInfo, 16);
568         device_vector<TextureInfo> texture_info;
569
570         typedef map<string, device_memory*> TexturesMap;
571         TexturesMap textures;
572
573         bool textures_need_update;
574
575 protected:
576         void flush_texture_buffers();
577 };
578
579 Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, Profiler &profiler, bool background);
580 Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, Profiler &profiler, bool background);
581
582 CCL_NAMESPACE_END
583
584 #endif