Merge branch 'master' into blender2.8
[blender.git] / intern / cycles / device / opencl / opencl.h
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifdef WITH_OPENCL
18
19 #include "device/device.h"
20 #include "device/device_denoising.h"
21
22 #include "util/util_map.h"
23 #include "util/util_param.h"
24 #include "util/util_string.h"
25
26 #include "clew.h"
27
28 CCL_NAMESPACE_BEGIN
29
30 /* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workaounds for testing */
31 #ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS
32 /* Work around AMD driver hangs by ensuring each command is finished before doing anything else. */
33 #  undef clEnqueueNDRangeKernel
34 #  define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \
35         CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \
36         clFinish(a);
37
38 #  undef clEnqueueWriteBuffer
39 #  define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \
40         CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \
41         clFinish(a);
42
43 #  undef clEnqueueReadBuffer
44 #  define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \
45         CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \
46         clFinish(a);
47 #endif  /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */
48
49 #define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
50
51 struct OpenCLPlatformDevice {
52         OpenCLPlatformDevice(cl_platform_id platform_id,
53                              const string& platform_name,
54                              cl_device_id device_id,
55                              cl_device_type device_type,
56                              const string& device_name,
57                              const string& hardware_id)
58           : platform_id(platform_id),
59             platform_name(platform_name),
60             device_id(device_id),
61             device_type(device_type),
62             device_name(device_name),
63             hardware_id(hardware_id) {}
64         cl_platform_id platform_id;
65         string platform_name;
66         cl_device_id device_id;
67         cl_device_type device_type;
68         string device_name;
69         string hardware_id;
70 };
71
72 /* Contains all static OpenCL helper functions. */
73 class OpenCLInfo
74 {
75 public:
76         static cl_device_type device_type();
77         static bool use_debug();
78         static bool kernel_use_advanced_shading(const string& platform_name);
79         static bool kernel_use_split(const string& platform_name,
80                                      const cl_device_type device_type);
81         static bool device_supported(const string& platform_name,
82                                      const cl_device_id device_id);
83         static bool platform_version_check(cl_platform_id platform,
84                                            string *error = NULL);
85         static bool device_version_check(cl_device_id device,
86                                          string *error = NULL);
87         static string get_hardware_id(const string& platform_name,
88                                       cl_device_id device_id);
89         static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices,
90                                        bool force_all = false);
91         static bool use_single_program();
92
93         /* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */
94
95         /* Platform information. */
96         static bool get_num_platforms(cl_uint *num_platforms, cl_int *error = NULL);
97         static cl_uint get_num_platforms();
98
99         static bool get_platforms(vector<cl_platform_id> *platform_ids,
100                                   cl_int *error = NULL);
101         static vector<cl_platform_id> get_platforms();
102
103         static bool get_platform_name(cl_platform_id platform_id,
104                                       string *platform_name);
105         static string get_platform_name(cl_platform_id platform_id);
106
107         static bool get_num_platform_devices(cl_platform_id platform_id,
108                                              cl_device_type device_type,
109                                              cl_uint *num_devices,
110                                              cl_int *error = NULL);
111         static cl_uint get_num_platform_devices(cl_platform_id platform_id,
112                                                 cl_device_type device_type);
113
114         static bool get_platform_devices(cl_platform_id platform_id,
115                                          cl_device_type device_type,
116                                          vector<cl_device_id> *device_ids,
117                                          cl_int* error = NULL);
118         static vector<cl_device_id> get_platform_devices(cl_platform_id platform_id,
119                                                          cl_device_type device_type);
120
121         /* Device information. */
122         static bool get_device_name(cl_device_id device_id,
123                                     string *device_name,
124                                     cl_int* error = NULL);
125
126         static string get_device_name(cl_device_id device_id);
127
128         static bool get_device_type(cl_device_id device_id,
129                                     cl_device_type *device_type,
130                                     cl_int* error = NULL);
131         static cl_device_type get_device_type(cl_device_id device_id);
132
133         static bool get_driver_version(cl_device_id device_id,
134                                        int *major,
135                                        int *minor,
136                                        cl_int* error = NULL);
137
138         static int mem_address_alignment(cl_device_id device_id);
139
140         /* Get somewhat more readable device name.
141          * Main difference is AMD OpenCL here which only gives code name
142          * for the regular device name. This will give more sane device
143          * name using some extensions.
144          */
145         static string get_readable_device_name(cl_device_id device_id);
146 };
147
148 /* Thread safe cache for contexts and programs.
149  */
150 class OpenCLCache
151 {
152         struct Slot
153         {
154                 struct ProgramEntry
155                 {
156                         ProgramEntry();
157                         ProgramEntry(const ProgramEntry& rhs);
158                         ~ProgramEntry();
159                         cl_program program;
160                         thread_mutex *mutex;
161                 };
162
163                 Slot();
164                 Slot(const Slot& rhs);
165                 ~Slot();
166
167                 thread_mutex *context_mutex;
168                 cl_context context;
169                 typedef map<ustring, ProgramEntry> EntryMap;
170                 EntryMap programs;
171
172         };
173
174         /* key is combination of platform ID and device ID */
175         typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair;
176
177         /* map of Slot objects */
178         typedef map<PlatformDevicePair, Slot> CacheMap;
179         CacheMap cache;
180
181         /* MD5 hash of the kernel source. */
182         string kernel_md5;
183
184         thread_mutex cache_lock;
185         thread_mutex kernel_md5_lock;
186
187         /* lazy instantiate */
188         static OpenCLCache& global_instance();
189
190 public:
191
192         enum ProgramName {
193                 OCL_DEV_BASE_PROGRAM,
194                 OCL_DEV_MEGAKERNEL_PROGRAM,
195         };
196
197         /* Lookup context in the cache. If this returns NULL, slot_locker
198          * will be holding a lock for the cache. slot_locker should refer to a
199          * default constructed thread_scoped_lock. */
200         static cl_context get_context(cl_platform_id platform,
201                                       cl_device_id device,
202                                       thread_scoped_lock& slot_locker);
203         /* Same as above. */
204         static cl_program get_program(cl_platform_id platform,
205                                       cl_device_id device,
206                                       ustring key,
207                                       thread_scoped_lock& slot_locker);
208
209         /* Store context in the cache. You MUST have tried to get the item before storing to it. */
210         static void store_context(cl_platform_id platform,
211                                   cl_device_id device,
212                                   cl_context context,
213                                   thread_scoped_lock& slot_locker);
214         /* Same as above. */
215         static void store_program(cl_platform_id platform,
216                                   cl_device_id device,
217                                   cl_program program,
218                                   ustring key,
219                                   thread_scoped_lock& slot_locker);
220
221         static string get_kernel_md5();
222 };
223
224 #define opencl_assert(stmt) \
225         { \
226                 cl_int err = stmt; \
227                 \
228                 if(err != CL_SUCCESS) { \
229                         string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
230                         if(error_msg == "") \
231                                 error_msg = message; \
232                         fprintf(stderr, "%s\n", message.c_str()); \
233                 } \
234         } (void)0
235
236 class OpenCLDeviceBase : public Device
237 {
238 public:
239         DedicatedTaskPool task_pool;
240         cl_context cxContext;
241         cl_command_queue cqCommandQueue;
242         cl_platform_id cpPlatform;
243         cl_device_id cdDevice;
244         cl_int ciErr;
245
246         class OpenCLProgram {
247         public:
248                 OpenCLProgram() : loaded(false), device(NULL) {}
249                 OpenCLProgram(OpenCLDeviceBase *device,
250                               const string& program_name,
251                               const string& kernel_name,
252                               const string& kernel_build_options,
253                               bool use_stdout = true);
254                 ~OpenCLProgram();
255
256                 void add_kernel(ustring name);
257                 void load();
258
259                 bool is_loaded() const { return loaded; }
260                 const string& get_log() const { return log; }
261                 void report_error();
262
263                 cl_kernel operator()();
264                 cl_kernel operator()(ustring name);
265
266                 void release();
267
268         private:
269                 bool build_kernel(const string *debug_src);
270                 bool compile_kernel(const string *debug_src);
271                 bool load_binary(const string& clbin, const string *debug_src = NULL);
272                 bool save_binary(const string& clbin);
273
274                 void add_log(const string& msg, bool is_debug);
275                 void add_error(const string& msg);
276
277                 bool loaded;
278                 cl_program program;
279                 OpenCLDeviceBase *device;
280
281                 /* Used for the OpenCLCache key. */
282                 string program_name;
283
284                 string kernel_file, kernel_build_options, device_md5;
285
286                 bool use_stdout;
287                 string log, error_msg;
288                 string compile_output;
289
290                 map<ustring, cl_kernel> kernels;
291         };
292
293         OpenCLProgram base_program, denoising_program;
294
295         typedef map<string, device_vector<uchar>*> ConstMemMap;
296         typedef map<string, device_ptr> MemMap;
297
298         ConstMemMap const_mem_map;
299         MemMap mem_map;
300         device_ptr null_mem;
301
302         bool device_initialized;
303         string platform_name;
304         string device_name;
305
306         bool opencl_error(cl_int err);
307         void opencl_error(const string& message);
308         void opencl_assert_err(cl_int err, const char* where);
309
310         OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_);
311         ~OpenCLDeviceBase();
312
313         static void CL_CALLBACK context_notify_callback(const char *err_info,
314                 const void * /*private_info*/, size_t /*cb*/, void *user_data);
315
316         bool opencl_version_check();
317
318         string device_md5_hash(string kernel_custom_build_options = "");
319         bool load_kernels(const DeviceRequestedFeatures& requested_features);
320
321         /* Has to be implemented by the real device classes.
322          * The base device will then load all these programs. */
323         virtual bool load_kernels(const DeviceRequestedFeatures& requested_features,
324                                   vector<OpenCLProgram*> &programs) = 0;
325
326         void mem_alloc(const char *name, device_memory& mem, MemoryType type);
327         void mem_copy_to(device_memory& mem);
328         void mem_copy_from(device_memory& mem, int y, int w, int h, int elem);
329         void mem_zero(device_memory& mem);
330         void mem_free(device_memory& mem);
331
332         int mem_address_alignment();
333
334         void const_copy_to(const char *name, void *host, size_t size);
335         void tex_alloc(const char *name,
336                        device_memory& mem,
337                        InterpolationType /*interpolation*/,
338                        ExtensionType /*extension*/);
339         void tex_free(device_memory& mem);
340
341         size_t global_size_round_up(int group_size, int global_size);
342         void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size = -1);
343         void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);
344
345         void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half);
346         void shader(DeviceTask& task);
347
348         void denoise(RenderTile& tile, const DeviceTask& task);
349
350         class OpenCLDeviceTask : public DeviceTask {
351         public:
352                 OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task)
353                 : DeviceTask(task)
354                 {
355                         run = function_bind(&OpenCLDeviceBase::thread_run,
356                                             device,
357                                             this);
358                 }
359         };
360
361         int get_split_task_count(DeviceTask& /*task*/)
362         {
363                 return 1;
364         }
365
366         void task_add(DeviceTask& task)
367         {
368                 task_pool.push(new OpenCLDeviceTask(this, task));
369         }
370
371         void task_wait()
372         {
373                 task_pool.wait();
374         }
375
376         void task_cancel()
377         {
378                 task_pool.cancel();
379         }
380
381         virtual void thread_run(DeviceTask * /*task*/) = 0;
382
383         virtual bool is_split_kernel() = 0;
384
385 protected:
386         string kernel_build_options(const string *debug_src = NULL);
387
388         void mem_zero_kernel(device_ptr ptr, size_t size);
389
390         bool denoising_non_local_means(device_ptr image_ptr,
391                                        device_ptr guide_ptr,
392                                        device_ptr variance_ptr,
393                                        device_ptr out_ptr,
394                                        DenoisingTask *task);
395         bool denoising_construct_transform(DenoisingTask *task);
396         bool denoising_reconstruct(device_ptr color_ptr,
397                                    device_ptr color_variance_ptr,
398                                    device_ptr output_ptr,
399                                    DenoisingTask *task);
400         bool denoising_combine_halves(device_ptr a_ptr,
401                                       device_ptr b_ptr,
402                                       device_ptr mean_ptr,
403                                       device_ptr variance_ptr,
404                                       int r, int4 rect,
405                                       DenoisingTask *task);
406         bool denoising_divide_shadow(device_ptr a_ptr,
407                                      device_ptr b_ptr,
408                                      device_ptr sample_variance_ptr,
409                                      device_ptr sv_variance_ptr,
410                                      device_ptr buffer_variance_ptr,
411                                      DenoisingTask *task);
412         bool denoising_get_feature(int mean_offset,
413                                    int variance_offset,
414                                    device_ptr mean_ptr,
415                                    device_ptr variance_ptr,
416                                    DenoisingTask *task);
417         bool denoising_detect_outliers(device_ptr image_ptr,
418                                        device_ptr variance_ptr,
419                                        device_ptr depth_ptr,
420                                        device_ptr output_ptr,
421                                        DenoisingTask *task);
422         bool denoising_set_tiles(device_ptr *buffers,
423                                  DenoisingTask *task);
424
425         device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int size, MemoryType type);
426         void mem_free_sub_ptr(device_ptr ptr);
427
428         class ArgumentWrapper {
429         public:
430                 ArgumentWrapper() : size(0), pointer(NULL)
431                 {
432                 }
433
434                 ArgumentWrapper(device_memory& argument) : size(sizeof(void*)),
435                                                            pointer((void*)(&argument.device_pointer))
436                 {
437                 }
438
439                 template<typename T>
440                 ArgumentWrapper(device_vector<T>& argument) : size(sizeof(void*)),
441                                                               pointer((void*)(&argument.device_pointer))
442                 {
443                 }
444
445                 template<typename T>
446                 ArgumentWrapper(T& argument) : size(sizeof(argument)),
447                                                pointer(&argument)
448                 {
449                 }
450
451                 ArgumentWrapper(int argument) : size(sizeof(int)),
452                                                 int_value(argument),
453                                                 pointer(&int_value)
454                 {
455                 }
456
457                 ArgumentWrapper(float argument) : size(sizeof(float)),
458                                                   float_value(argument),
459                                                   pointer(&float_value)
460                 {
461                 }
462
463                 size_t size;
464                 int int_value;
465                 float float_value;
466                 void *pointer;
467         };
468
469         /* TODO(sergey): In the future we can use variadic templates, once
470          * C++0x is allowed. Should allow to clean this up a bit.
471          */
472         int kernel_set_args(cl_kernel kernel,
473                             int start_argument_index,
474                             const ArgumentWrapper& arg1 = ArgumentWrapper(),
475                             const ArgumentWrapper& arg2 = ArgumentWrapper(),
476                             const ArgumentWrapper& arg3 = ArgumentWrapper(),
477                             const ArgumentWrapper& arg4 = ArgumentWrapper(),
478                             const ArgumentWrapper& arg5 = ArgumentWrapper(),
479                             const ArgumentWrapper& arg6 = ArgumentWrapper(),
480                             const ArgumentWrapper& arg7 = ArgumentWrapper(),
481                             const ArgumentWrapper& arg8 = ArgumentWrapper(),
482                             const ArgumentWrapper& arg9 = ArgumentWrapper(),
483                             const ArgumentWrapper& arg10 = ArgumentWrapper(),
484                             const ArgumentWrapper& arg11 = ArgumentWrapper(),
485                             const ArgumentWrapper& arg12 = ArgumentWrapper(),
486                             const ArgumentWrapper& arg13 = ArgumentWrapper(),
487                             const ArgumentWrapper& arg14 = ArgumentWrapper(),
488                             const ArgumentWrapper& arg15 = ArgumentWrapper(),
489                             const ArgumentWrapper& arg16 = ArgumentWrapper(),
490                             const ArgumentWrapper& arg17 = ArgumentWrapper(),
491                             const ArgumentWrapper& arg18 = ArgumentWrapper(),
492                             const ArgumentWrapper& arg19 = ArgumentWrapper(),
493                             const ArgumentWrapper& arg20 = ArgumentWrapper(),
494                             const ArgumentWrapper& arg21 = ArgumentWrapper(),
495                             const ArgumentWrapper& arg22 = ArgumentWrapper(),
496                             const ArgumentWrapper& arg23 = ArgumentWrapper(),
497                             const ArgumentWrapper& arg24 = ArgumentWrapper(),
498                             const ArgumentWrapper& arg25 = ArgumentWrapper(),
499                             const ArgumentWrapper& arg26 = ArgumentWrapper(),
500                             const ArgumentWrapper& arg27 = ArgumentWrapper(),
501                             const ArgumentWrapper& arg28 = ArgumentWrapper(),
502                             const ArgumentWrapper& arg29 = ArgumentWrapper(),
503                             const ArgumentWrapper& arg30 = ArgumentWrapper(),
504                             const ArgumentWrapper& arg31 = ArgumentWrapper(),
505                             const ArgumentWrapper& arg32 = ArgumentWrapper(),
506                             const ArgumentWrapper& arg33 = ArgumentWrapper());
507
508         void release_kernel_safe(cl_kernel kernel);
509         void release_mem_object_safe(cl_mem mem);
510         void release_program_safe(cl_program program);
511
512         /* ** Those guys are for workign around some compiler-specific bugs ** */
513
514         virtual cl_program load_cached_kernel(
515                 ustring key,
516                 thread_scoped_lock& cache_locker);
517
518         virtual void store_cached_kernel(
519                 cl_program program,
520                 ustring key,
521                 thread_scoped_lock& cache_locker);
522
523         virtual string build_options_for_base_program(
524                 const DeviceRequestedFeatures& /*requested_features*/);
525 };
526
527 Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, bool background);
528 Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, bool background);
529
530 CCL_NAMESPACE_END
531
532 #endif