1dd4ad7df7f19179114e10928deea2b662b5c12b
[blender-staging.git] / intern / cycles / device / opencl / opencl.h
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifdef WITH_OPENCL
18
19 #include "device/device.h"
20 #include "device/device_denoising.h"
21
22 #include "util/util_map.h"
23 #include "util/util_param.h"
24 #include "util/util_string.h"
25
26 #include "clew.h"
27
28 #include "device/opencl/memory_manager.h"
29
30 CCL_NAMESPACE_BEGIN
31
32 /* Disable workarounds, seems to be working fine on latest drivers. */
33 #define CYCLES_DISABLE_DRIVER_WORKAROUNDS
34
35 /* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workaounds for testing */
36 #ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS
37 /* Work around AMD driver hangs by ensuring each command is finished before doing anything else. */
38 #  undef clEnqueueNDRangeKernel
39 #  define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \
40         CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \
41         clFinish(a);
42
43 #  undef clEnqueueWriteBuffer
44 #  define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \
45         CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \
46         clFinish(a);
47
48 #  undef clEnqueueReadBuffer
49 #  define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \
50         CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \
51         clFinish(a);
52 #endif  /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */
53
54 #define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
55
56 struct OpenCLPlatformDevice {
57         OpenCLPlatformDevice(cl_platform_id platform_id,
58                              const string& platform_name,
59                              cl_device_id device_id,
60                              cl_device_type device_type,
61                              const string& device_name,
62                              const string& hardware_id)
63           : platform_id(platform_id),
64             platform_name(platform_name),
65             device_id(device_id),
66             device_type(device_type),
67             device_name(device_name),
68             hardware_id(hardware_id) {}
69         cl_platform_id platform_id;
70         string platform_name;
71         cl_device_id device_id;
72         cl_device_type device_type;
73         string device_name;
74         string hardware_id;
75 };
76
77 /* Contains all static OpenCL helper functions. */
78 class OpenCLInfo
79 {
80 public:
81         static cl_device_type device_type();
82         static bool use_debug();
83         static bool kernel_use_advanced_shading(const string& platform_name);
84         static bool kernel_use_split(const string& platform_name,
85                                      const cl_device_type device_type);
86         static bool device_supported(const string& platform_name,
87                                      const cl_device_id device_id);
88         static bool platform_version_check(cl_platform_id platform,
89                                            string *error = NULL);
90         static bool device_version_check(cl_device_id device,
91                                          string *error = NULL);
92         static string get_hardware_id(const string& platform_name,
93                                       cl_device_id device_id);
94         static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices,
95                                        bool force_all = false);
96         static bool use_single_program();
97
98         /* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */
99
100         /* Platform information. */
101         static bool get_num_platforms(cl_uint *num_platforms, cl_int *error = NULL);
102         static cl_uint get_num_platforms();
103
104         static bool get_platforms(vector<cl_platform_id> *platform_ids,
105                                   cl_int *error = NULL);
106         static vector<cl_platform_id> get_platforms();
107
108         static bool get_platform_name(cl_platform_id platform_id,
109                                       string *platform_name);
110         static string get_platform_name(cl_platform_id platform_id);
111
112         static bool get_num_platform_devices(cl_platform_id platform_id,
113                                              cl_device_type device_type,
114                                              cl_uint *num_devices,
115                                              cl_int *error = NULL);
116         static cl_uint get_num_platform_devices(cl_platform_id platform_id,
117                                                 cl_device_type device_type);
118
119         static bool get_platform_devices(cl_platform_id platform_id,
120                                          cl_device_type device_type,
121                                          vector<cl_device_id> *device_ids,
122                                          cl_int* error = NULL);
123         static vector<cl_device_id> get_platform_devices(cl_platform_id platform_id,
124                                                          cl_device_type device_type);
125
126         /* Device information. */
127         static bool get_device_name(cl_device_id device_id,
128                                     string *device_name,
129                                     cl_int* error = NULL);
130
131         static string get_device_name(cl_device_id device_id);
132
133         static bool get_device_type(cl_device_id device_id,
134                                     cl_device_type *device_type,
135                                     cl_int* error = NULL);
136         static cl_device_type get_device_type(cl_device_id device_id);
137
138         static bool get_driver_version(cl_device_id device_id,
139                                        int *major,
140                                        int *minor,
141                                        cl_int* error = NULL);
142
143         static int mem_address_alignment(cl_device_id device_id);
144
145         /* Get somewhat more readable device name.
146          * Main difference is AMD OpenCL here which only gives code name
147          * for the regular device name. This will give more sane device
148          * name using some extensions.
149          */
150         static string get_readable_device_name(cl_device_id device_id);
151 };
152
153 /* Thread safe cache for contexts and programs.
154  */
155 class OpenCLCache
156 {
157         struct Slot
158         {
159                 struct ProgramEntry
160                 {
161                         ProgramEntry();
162                         ProgramEntry(const ProgramEntry& rhs);
163                         ~ProgramEntry();
164                         cl_program program;
165                         thread_mutex *mutex;
166                 };
167
168                 Slot();
169                 Slot(const Slot& rhs);
170                 ~Slot();
171
172                 thread_mutex *context_mutex;
173                 cl_context context;
174                 typedef map<ustring, ProgramEntry> EntryMap;
175                 EntryMap programs;
176
177         };
178
179         /* key is combination of platform ID and device ID */
180         typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair;
181
182         /* map of Slot objects */
183         typedef map<PlatformDevicePair, Slot> CacheMap;
184         CacheMap cache;
185
186         /* MD5 hash of the kernel source. */
187         string kernel_md5;
188
189         thread_mutex cache_lock;
190         thread_mutex kernel_md5_lock;
191
192         /* lazy instantiate */
193         static OpenCLCache& global_instance();
194
195 public:
196
197         enum ProgramName {
198                 OCL_DEV_BASE_PROGRAM,
199                 OCL_DEV_MEGAKERNEL_PROGRAM,
200         };
201
202         /* Lookup context in the cache. If this returns NULL, slot_locker
203          * will be holding a lock for the cache. slot_locker should refer to a
204          * default constructed thread_scoped_lock. */
205         static cl_context get_context(cl_platform_id platform,
206                                       cl_device_id device,
207                                       thread_scoped_lock& slot_locker);
208         /* Same as above. */
209         static cl_program get_program(cl_platform_id platform,
210                                       cl_device_id device,
211                                       ustring key,
212                                       thread_scoped_lock& slot_locker);
213
214         /* Store context in the cache. You MUST have tried to get the item before storing to it. */
215         static void store_context(cl_platform_id platform,
216                                   cl_device_id device,
217                                   cl_context context,
218                                   thread_scoped_lock& slot_locker);
219         /* Same as above. */
220         static void store_program(cl_platform_id platform,
221                                   cl_device_id device,
222                                   cl_program program,
223                                   ustring key,
224                                   thread_scoped_lock& slot_locker);
225
226         static string get_kernel_md5();
227 };
228
229 #define opencl_device_assert(device, stmt) \
230         { \
231                 cl_int err = stmt; \
232                 \
233                 if(err != CL_SUCCESS) { \
234                         string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
235                         if((device)->error_message() == "") \
236                                 (device)->set_error(message); \
237                         fprintf(stderr, "%s\n", message.c_str()); \
238                 } \
239         } (void)0
240
241 #define opencl_assert(stmt) \
242         { \
243                 cl_int err = stmt; \
244                 \
245                 if(err != CL_SUCCESS) { \
246                         string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
247                         if(error_msg == "") \
248                                 error_msg = message; \
249                         fprintf(stderr, "%s\n", message.c_str()); \
250                 } \
251         } (void)0
252
253 class OpenCLDeviceBase : public Device
254 {
255 public:
256         DedicatedTaskPool task_pool;
257         cl_context cxContext;
258         cl_command_queue cqCommandQueue;
259         cl_platform_id cpPlatform;
260         cl_device_id cdDevice;
261         cl_int ciErr;
262
263         class OpenCLProgram {
264         public:
265                 OpenCLProgram() : loaded(false), device(NULL) {}
266                 OpenCLProgram(OpenCLDeviceBase *device,
267                               const string& program_name,
268                               const string& kernel_name,
269                               const string& kernel_build_options,
270                               bool use_stdout = true);
271                 ~OpenCLProgram();
272
273                 void add_kernel(ustring name);
274                 void load();
275
276                 bool is_loaded() const { return loaded; }
277                 const string& get_log() const { return log; }
278                 void report_error();
279
280                 cl_kernel operator()();
281                 cl_kernel operator()(ustring name);
282
283                 void release();
284
285         private:
286                 bool build_kernel(const string *debug_src);
287                 bool compile_kernel(const string *debug_src);
288                 bool load_binary(const string& clbin, const string *debug_src = NULL);
289                 bool save_binary(const string& clbin);
290
291                 void add_log(const string& msg, bool is_debug);
292                 void add_error(const string& msg);
293
294                 bool loaded;
295                 cl_program program;
296                 OpenCLDeviceBase *device;
297
298                 /* Used for the OpenCLCache key. */
299                 string program_name;
300
301                 string kernel_file, kernel_build_options, device_md5;
302
303                 bool use_stdout;
304                 string log, error_msg;
305                 string compile_output;
306
307                 map<ustring, cl_kernel> kernels;
308         };
309
310         OpenCLProgram base_program, denoising_program;
311
312         typedef map<string, device_vector<uchar>*> ConstMemMap;
313         typedef map<string, device_ptr> MemMap;
314
315         ConstMemMap const_mem_map;
316         MemMap mem_map;
317         device_ptr null_mem;
318
319         bool device_initialized;
320         string platform_name;
321         string device_name;
322
323         bool opencl_error(cl_int err);
324         void opencl_error(const string& message);
325         void opencl_assert_err(cl_int err, const char* where);
326
327         OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_);
328         ~OpenCLDeviceBase();
329
330         static void CL_CALLBACK context_notify_callback(const char *err_info,
331                 const void * /*private_info*/, size_t /*cb*/, void *user_data);
332
333         bool opencl_version_check();
334
335         string device_md5_hash(string kernel_custom_build_options = "");
336         bool load_kernels(const DeviceRequestedFeatures& requested_features);
337
338         /* Has to be implemented by the real device classes.
339          * The base device will then load all these programs. */
340         virtual bool load_kernels(const DeviceRequestedFeatures& requested_features,
341                                   vector<OpenCLProgram*> &programs) = 0;
342
343         void mem_alloc(device_memory& mem);
344         void mem_copy_to(device_memory& mem);
345         void mem_copy_from(device_memory& mem, int y, int w, int h, int elem);
346         void mem_zero(device_memory& mem);
347         void mem_free(device_memory& mem);
348
349         int mem_address_alignment();
350
351         void const_copy_to(const char *name, void *host, size_t size);
352         void tex_alloc(device_memory& mem);
353         void tex_free(device_memory& mem);
354
355         size_t global_size_round_up(int group_size, int global_size);
356         void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size = -1);
357         void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);
358         void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg);
359
360         void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half);
361         void shader(DeviceTask& task);
362
363         void denoise(RenderTile& tile, const DeviceTask& task);
364
365         class OpenCLDeviceTask : public DeviceTask {
366         public:
367                 OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task)
368                 : DeviceTask(task)
369                 {
370                         run = function_bind(&OpenCLDeviceBase::thread_run,
371                                             device,
372                                             this);
373                 }
374         };
375
376         int get_split_task_count(DeviceTask& /*task*/)
377         {
378                 return 1;
379         }
380
381         void task_add(DeviceTask& task)
382         {
383                 task_pool.push(new OpenCLDeviceTask(this, task));
384         }
385
386         void task_wait()
387         {
388                 task_pool.wait();
389         }
390
391         void task_cancel()
392         {
393                 task_pool.cancel();
394         }
395
396         virtual void thread_run(DeviceTask * /*task*/) = 0;
397
398         virtual bool is_split_kernel() = 0;
399
400 protected:
401         string kernel_build_options(const string *debug_src = NULL);
402
403         void mem_zero_kernel(device_ptr ptr, size_t size);
404
405         bool denoising_non_local_means(device_ptr image_ptr,
406                                        device_ptr guide_ptr,
407                                        device_ptr variance_ptr,
408                                        device_ptr out_ptr,
409                                        DenoisingTask *task);
410         bool denoising_construct_transform(DenoisingTask *task);
411         bool denoising_reconstruct(device_ptr color_ptr,
412                                    device_ptr color_variance_ptr,
413                                    device_ptr output_ptr,
414                                    DenoisingTask *task);
415         bool denoising_combine_halves(device_ptr a_ptr,
416                                       device_ptr b_ptr,
417                                       device_ptr mean_ptr,
418                                       device_ptr variance_ptr,
419                                       int r, int4 rect,
420                                       DenoisingTask *task);
421         bool denoising_divide_shadow(device_ptr a_ptr,
422                                      device_ptr b_ptr,
423                                      device_ptr sample_variance_ptr,
424                                      device_ptr sv_variance_ptr,
425                                      device_ptr buffer_variance_ptr,
426                                      DenoisingTask *task);
427         bool denoising_get_feature(int mean_offset,
428                                    int variance_offset,
429                                    device_ptr mean_ptr,
430                                    device_ptr variance_ptr,
431                                    DenoisingTask *task);
432         bool denoising_detect_outliers(device_ptr image_ptr,
433                                        device_ptr variance_ptr,
434                                        device_ptr depth_ptr,
435                                        device_ptr output_ptr,
436                                        DenoisingTask *task);
437         bool denoising_set_tiles(device_ptr *buffers,
438                                  DenoisingTask *task);
439
440         device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int size);
441         void mem_free_sub_ptr(device_ptr ptr);
442
443         class ArgumentWrapper {
444         public:
445                 ArgumentWrapper() : size(0), pointer(NULL)
446                 {
447                 }
448
449                 ArgumentWrapper(device_memory& argument) : size(sizeof(void*)),
450                                                            pointer((void*)(&argument.device_pointer))
451                 {
452                 }
453
454                 template<typename T>
455                 ArgumentWrapper(device_vector<T>& argument) : size(sizeof(void*)),
456                                                               pointer((void*)(&argument.device_pointer))
457                 {
458                 }
459
460                 template<typename T>
461                 ArgumentWrapper(T& argument) : size(sizeof(argument)),
462                                                pointer(&argument)
463                 {
464                 }
465
466                 ArgumentWrapper(int argument) : size(sizeof(int)),
467                                                 int_value(argument),
468                                                 pointer(&int_value)
469                 {
470                 }
471
472                 ArgumentWrapper(float argument) : size(sizeof(float)),
473                                                   float_value(argument),
474                                                   pointer(&float_value)
475                 {
476                 }
477
478                 size_t size;
479                 int int_value;
480                 float float_value;
481                 void *pointer;
482         };
483
484         /* TODO(sergey): In the future we can use variadic templates, once
485          * C++0x is allowed. Should allow to clean this up a bit.
486          */
487         int kernel_set_args(cl_kernel kernel,
488                             int start_argument_index,
489                             const ArgumentWrapper& arg1 = ArgumentWrapper(),
490                             const ArgumentWrapper& arg2 = ArgumentWrapper(),
491                             const ArgumentWrapper& arg3 = ArgumentWrapper(),
492                             const ArgumentWrapper& arg4 = ArgumentWrapper(),
493                             const ArgumentWrapper& arg5 = ArgumentWrapper(),
494                             const ArgumentWrapper& arg6 = ArgumentWrapper(),
495                             const ArgumentWrapper& arg7 = ArgumentWrapper(),
496                             const ArgumentWrapper& arg8 = ArgumentWrapper(),
497                             const ArgumentWrapper& arg9 = ArgumentWrapper(),
498                             const ArgumentWrapper& arg10 = ArgumentWrapper(),
499                             const ArgumentWrapper& arg11 = ArgumentWrapper(),
500                             const ArgumentWrapper& arg12 = ArgumentWrapper(),
501                             const ArgumentWrapper& arg13 = ArgumentWrapper(),
502                             const ArgumentWrapper& arg14 = ArgumentWrapper(),
503                             const ArgumentWrapper& arg15 = ArgumentWrapper(),
504                             const ArgumentWrapper& arg16 = ArgumentWrapper(),
505                             const ArgumentWrapper& arg17 = ArgumentWrapper(),
506                             const ArgumentWrapper& arg18 = ArgumentWrapper(),
507                             const ArgumentWrapper& arg19 = ArgumentWrapper(),
508                             const ArgumentWrapper& arg20 = ArgumentWrapper(),
509                             const ArgumentWrapper& arg21 = ArgumentWrapper(),
510                             const ArgumentWrapper& arg22 = ArgumentWrapper(),
511                             const ArgumentWrapper& arg23 = ArgumentWrapper(),
512                             const ArgumentWrapper& arg24 = ArgumentWrapper(),
513                             const ArgumentWrapper& arg25 = ArgumentWrapper(),
514                             const ArgumentWrapper& arg26 = ArgumentWrapper(),
515                             const ArgumentWrapper& arg27 = ArgumentWrapper(),
516                             const ArgumentWrapper& arg28 = ArgumentWrapper(),
517                             const ArgumentWrapper& arg29 = ArgumentWrapper(),
518                             const ArgumentWrapper& arg30 = ArgumentWrapper(),
519                             const ArgumentWrapper& arg31 = ArgumentWrapper(),
520                             const ArgumentWrapper& arg32 = ArgumentWrapper(),
521                             const ArgumentWrapper& arg33 = ArgumentWrapper());
522
523         void release_kernel_safe(cl_kernel kernel);
524         void release_mem_object_safe(cl_mem mem);
525         void release_program_safe(cl_program program);
526
527         /* ** Those guys are for workign around some compiler-specific bugs ** */
528
529         virtual cl_program load_cached_kernel(
530                 ustring key,
531                 thread_scoped_lock& cache_locker);
532
533         virtual void store_cached_kernel(
534                 cl_program program,
535                 ustring key,
536                 thread_scoped_lock& cache_locker);
537
538         virtual string build_options_for_base_program(
539                 const DeviceRequestedFeatures& /*requested_features*/);
540
541 private:
542         MemoryManager memory_manager;
543         friend class MemoryManager;
544
545         static_assert_align(TextureInfo, 16);
546
547         vector<TextureInfo> texture_info;
548         device_memory texture_info_buffer;
549
550         typedef map<string, device_memory*> TexturesMap;
551         TexturesMap textures;
552
553         bool textures_need_update;
554
555 protected:
556         void flush_texture_buffers();
557 };
558
559 Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, bool background);
560 Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, bool background);
561
562 CCL_NAMESPACE_END
563
564 #endif