Cycles: Support multithreaded compilation of kernels
[blender.git] / intern / cycles / device / opencl / opencl.h
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifdef WITH_OPENCL
18
19 #include "device/device.h"
20 #include "device/device_denoising.h"
21
22 #include "util/util_map.h"
23 #include "util/util_param.h"
24 #include "util/util_string.h"
25
26 #include "clew.h"
27
28 #include "device/opencl/memory_manager.h"
29
30 CCL_NAMESPACE_BEGIN
31
32 /* Disable workarounds, seems to be working fine on latest drivers. */
33 #define CYCLES_DISABLE_DRIVER_WORKAROUNDS
34
35 /* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workaounds for testing */
36 #ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS
37 /* Work around AMD driver hangs by ensuring each command is finished before doing anything else. */
38 #  undef clEnqueueNDRangeKernel
39 #  define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \
40         CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \
41         clFinish(a);
42
43 #  undef clEnqueueWriteBuffer
44 #  define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \
45         CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \
46         clFinish(a);
47
48 #  undef clEnqueueReadBuffer
49 #  define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \
50         CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \
51         clFinish(a);
52 #endif  /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */
53
54 #define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
55
56 struct OpenCLPlatformDevice {
57         OpenCLPlatformDevice(cl_platform_id platform_id,
58                              const string& platform_name,
59                              cl_device_id device_id,
60                              cl_device_type device_type,
61                              const string& device_name,
62                              const string& hardware_id,
63                                  const string& device_extensions)
64           : platform_id(platform_id),
65             platform_name(platform_name),
66             device_id(device_id),
67             device_type(device_type),
68             device_name(device_name),
69             hardware_id(hardware_id),
70             device_extensions(device_extensions) {}
71         cl_platform_id platform_id;
72         string platform_name;
73         cl_device_id device_id;
74         cl_device_type device_type;
75         string device_name;
76         string hardware_id;
77         string device_extensions;
78 };
79
80 /* Contains all static OpenCL helper functions. */
81 class OpenCLInfo
82 {
83 public:
84         static cl_device_type device_type();
85         static bool use_debug();
86         static bool kernel_use_advanced_shading(const string& platform_name);
87         static bool kernel_use_split(const string& platform_name,
88                                      const cl_device_type device_type);
89         static bool device_supported(const string& platform_name,
90                                      const cl_device_id device_id);
91         static bool platform_version_check(cl_platform_id platform,
92                                            string *error = NULL);
93         static bool device_version_check(cl_device_id device,
94                                          string *error = NULL);
95         static string get_hardware_id(const string& platform_name,
96                                       cl_device_id device_id);
97         static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices,
98                                        bool force_all = false);
99         static bool use_single_program();
100
101         /* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */
102
103         /* Platform information. */
104         static bool get_num_platforms(cl_uint *num_platforms, cl_int *error = NULL);
105         static cl_uint get_num_platforms();
106
107         static bool get_platforms(vector<cl_platform_id> *platform_ids,
108                                   cl_int *error = NULL);
109         static vector<cl_platform_id> get_platforms();
110
111         static bool get_platform_name(cl_platform_id platform_id,
112                                       string *platform_name);
113         static string get_platform_name(cl_platform_id platform_id);
114
115         static bool get_num_platform_devices(cl_platform_id platform_id,
116                                              cl_device_type device_type,
117                                              cl_uint *num_devices,
118                                              cl_int *error = NULL);
119         static cl_uint get_num_platform_devices(cl_platform_id platform_id,
120                                                 cl_device_type device_type);
121
122         static bool get_platform_devices(cl_platform_id platform_id,
123                                          cl_device_type device_type,
124                                          vector<cl_device_id> *device_ids,
125                                          cl_int* error = NULL);
126         static vector<cl_device_id> get_platform_devices(cl_platform_id platform_id,
127                                                          cl_device_type device_type);
128
129         /* Device information. */
130         static bool get_device_name(cl_device_id device_id,
131                                     string *device_name,
132                                     cl_int* error = NULL);
133
134         static string get_device_name(cl_device_id device_id);
135
136         static bool get_device_extensions(cl_device_id device_id,
137                                           string *device_extensions,
138                                           cl_int* error = NULL);
139
140         static string get_device_extensions(cl_device_id device_id);
141
142         static bool get_device_type(cl_device_id device_id,
143                                     cl_device_type *device_type,
144                                     cl_int* error = NULL);
145         static cl_device_type get_device_type(cl_device_id device_id);
146
147         static bool get_driver_version(cl_device_id device_id,
148                                        int *major,
149                                        int *minor,
150                                        cl_int* error = NULL);
151
152         static int mem_sub_ptr_alignment(cl_device_id device_id);
153
154         /* Get somewhat more readable device name.
155          * Main difference is AMD OpenCL here which only gives code name
156          * for the regular device name. This will give more sane device
157          * name using some extensions.
158          */
159         static string get_readable_device_name(cl_device_id device_id);
160 };
161
162 /* Thread safe cache for contexts and programs.
163  */
164 class OpenCLCache
165 {
166         struct Slot
167         {
168                 struct ProgramEntry
169                 {
170                         ProgramEntry();
171                         ProgramEntry(const ProgramEntry& rhs);
172                         ~ProgramEntry();
173                         cl_program program;
174                         thread_mutex *mutex;
175                 };
176
177                 Slot();
178                 Slot(const Slot& rhs);
179                 ~Slot();
180
181                 thread_mutex *context_mutex;
182                 cl_context context;
183                 typedef map<ustring, ProgramEntry> EntryMap;
184                 EntryMap programs;
185
186         };
187
188         /* key is combination of platform ID and device ID */
189         typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair;
190
191         /* map of Slot objects */
192         typedef map<PlatformDevicePair, Slot> CacheMap;
193         CacheMap cache;
194
195         /* MD5 hash of the kernel source. */
196         string kernel_md5;
197
198         thread_mutex cache_lock;
199         thread_mutex kernel_md5_lock;
200
201         /* lazy instantiate */
202         static OpenCLCache& global_instance();
203
204 public:
205
206         enum ProgramName {
207                 OCL_DEV_BASE_PROGRAM,
208                 OCL_DEV_MEGAKERNEL_PROGRAM,
209         };
210
211         /* Lookup context in the cache. If this returns NULL, slot_locker
212          * will be holding a lock for the cache. slot_locker should refer to a
213          * default constructed thread_scoped_lock. */
214         static cl_context get_context(cl_platform_id platform,
215                                       cl_device_id device,
216                                       thread_scoped_lock& slot_locker);
217         /* Same as above. */
218         static cl_program get_program(cl_platform_id platform,
219                                       cl_device_id device,
220                                       ustring key,
221                                       thread_scoped_lock& slot_locker);
222
223         /* Store context in the cache. You MUST have tried to get the item before storing to it. */
224         static void store_context(cl_platform_id platform,
225                                   cl_device_id device,
226                                   cl_context context,
227                                   thread_scoped_lock& slot_locker);
228         /* Same as above. */
229         static void store_program(cl_platform_id platform,
230                                   cl_device_id device,
231                                   cl_program program,
232                                   ustring key,
233                                   thread_scoped_lock& slot_locker);
234
235         static string get_kernel_md5();
236 };
237
238 #define opencl_device_assert(device, stmt) \
239         { \
240                 cl_int err = stmt; \
241                 \
242                 if(err != CL_SUCCESS) { \
243                         string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
244                         if((device)->error_message() == "") \
245                                 (device)->set_error(message); \
246                         fprintf(stderr, "%s\n", message.c_str()); \
247                 } \
248         } (void) 0
249
250 #define opencl_assert(stmt) \
251         { \
252                 cl_int err = stmt; \
253                 \
254                 if(err != CL_SUCCESS) { \
255                         string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
256                         if(error_msg == "") \
257                                 error_msg = message; \
258                         fprintf(stderr, "%s\n", message.c_str()); \
259                 } \
260         } (void) 0
261
262 class OpenCLDeviceBase : public Device
263 {
264 public:
265         DedicatedTaskPool task_pool;
266         cl_context cxContext;
267         cl_command_queue cqCommandQueue;
268         cl_platform_id cpPlatform;
269         cl_device_id cdDevice;
270         cl_int ciErr;
271         int device_num;
272
273         class OpenCLProgram {
274         public:
275                 OpenCLProgram() : loaded(false), program(NULL), device(NULL) {}
276                 OpenCLProgram(OpenCLDeviceBase *device,
277                               const string& program_name,
278                               const string& kernel_name,
279                               const string& kernel_build_options,
280                               bool use_stdout = true);
281                 ~OpenCLProgram();
282
283                 void add_kernel(ustring name);
284                 void load();
285
286                 bool is_loaded() const { return loaded; }
287                 const string& get_log() const { return log; }
288                 void report_error();
289
290                 cl_kernel operator()();
291                 cl_kernel operator()(ustring name);
292
293                 void release();
294
295         private:
296                 bool build_kernel(const string *debug_src);
297                 /* Build the program by calling the own process.
298                  * This is required for multithreaded OpenCL compilation, since most Frameworks serialize
299                  * build calls internally if they come from the same process.
300                  * If that is not supported, this function just returns false.
301                  */
302                 bool compile_separate(const string& clbin);
303                 /* Build the program by calling OpenCL directly. */
304                 bool compile_kernel(const string *debug_src);
305                 /* Loading and saving the program from/to disk. */
306                 bool load_binary(const string& clbin, const string *debug_src = NULL);
307                 bool save_binary(const string& clbin);
308
309                 void add_log(const string& msg, bool is_debug);
310                 void add_error(const string& msg);
311
312                 bool loaded;
313                 cl_program program;
314                 OpenCLDeviceBase *device;
315
316                 /* Used for the OpenCLCache key. */
317                 string program_name;
318
319                 string kernel_file, kernel_build_options, device_md5;
320
321                 bool use_stdout;
322                 string log, error_msg;
323                 string compile_output;
324
325                 map<ustring, cl_kernel> kernels;
326         };
327
328         OpenCLProgram base_program, denoising_program;
329
330         typedef map<string, device_vector<uchar>*> ConstMemMap;
331         typedef map<string, device_ptr> MemMap;
332
333         ConstMemMap const_mem_map;
334         MemMap mem_map;
335         device_ptr null_mem;
336
337         bool device_initialized;
338         string platform_name;
339         string device_name;
340
341         bool opencl_error(cl_int err);
342         void opencl_error(const string& message);
343         void opencl_assert_err(cl_int err, const char* where);
344
345         OpenCLDeviceBase(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background_);
346         ~OpenCLDeviceBase();
347
348         static void CL_CALLBACK context_notify_callback(const char *err_info,
349                 const void * /*private_info*/, size_t /*cb*/, void *user_data);
350
351         bool opencl_version_check();
352
353         string device_md5_hash(string kernel_custom_build_options = "");
354         virtual bool load_kernels(const DeviceRequestedFeatures& requested_features);
355
356         /* Has to be implemented by the real device classes.
357          * The base device will then load all these programs. */
358         virtual bool add_kernel_programs(const DeviceRequestedFeatures& requested_features,
359                                          vector<OpenCLProgram*> &programs) = 0;
360
361         /* Get the name of the opencl program for the given kernel */
362         virtual const string get_opencl_program_name(bool single_program, const string& kernel_name) = 0;
363         /* Get the program file name to compile (*.cl) for the given kernel */
364         virtual const string get_opencl_program_filename(bool single_program, const string& kernel_name) = 0;
365
366         void mem_alloc(device_memory& mem);
367         void mem_copy_to(device_memory& mem);
368         void mem_copy_from(device_memory& mem, int y, int w, int h, int elem);
369         void mem_zero(device_memory& mem);
370         void mem_free(device_memory& mem);
371
372         int mem_sub_ptr_alignment();
373
374         void const_copy_to(const char *name, void *host, size_t size);
375         void tex_alloc(device_memory& mem);
376         void tex_free(device_memory& mem);
377
378         size_t global_size_round_up(int group_size, int global_size);
379         void enqueue_kernel(cl_kernel kernel, size_t w, size_t h,
380                             bool x_workgroups = false,
381                             size_t max_workgroup_size = -1);
382         void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);
383         void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg);
384
385         void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half);
386         void shader(DeviceTask& task);
387
388         void denoise(RenderTile& tile, DenoisingTask& denoising);
389
390         class OpenCLDeviceTask : public DeviceTask {
391         public:
392                 OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task)
393                 : DeviceTask(task)
394                 {
395                         run = function_bind(&OpenCLDeviceBase::thread_run,
396                                             device,
397                                             this);
398                 }
399         };
400
401         int get_split_task_count(DeviceTask& /*task*/)
402         {
403                 return 1;
404         }
405
406         void task_add(DeviceTask& task)
407         {
408                 task_pool.push(new OpenCLDeviceTask(this, task));
409         }
410
411         void task_wait()
412         {
413                 task_pool.wait();
414         }
415
416         void task_cancel()
417         {
418                 task_pool.cancel();
419         }
420
421         virtual void thread_run(DeviceTask * /*task*/) = 0;
422
423         virtual bool is_split_kernel() = 0;
424
425 protected:
426         string kernel_build_options(const string *debug_src = NULL);
427
428         void mem_zero_kernel(device_ptr ptr, size_t size);
429
430         bool denoising_non_local_means(device_ptr image_ptr,
431                                        device_ptr guide_ptr,
432                                        device_ptr variance_ptr,
433                                        device_ptr out_ptr,
434                                        DenoisingTask *task);
435         bool denoising_construct_transform(DenoisingTask *task);
436         bool denoising_accumulate(device_ptr color_ptr,
437                                   device_ptr color_variance_ptr,
438                                   device_ptr scale_ptr,
439                                   int frame,
440                                   DenoisingTask *task);
441         bool denoising_solve(device_ptr output_ptr,
442                              DenoisingTask *task);
443         bool denoising_combine_halves(device_ptr a_ptr,
444                                       device_ptr b_ptr,
445                                       device_ptr mean_ptr,
446                                       device_ptr variance_ptr,
447                                       int r, int4 rect,
448                                       DenoisingTask *task);
449         bool denoising_divide_shadow(device_ptr a_ptr,
450                                      device_ptr b_ptr,
451                                      device_ptr sample_variance_ptr,
452                                      device_ptr sv_variance_ptr,
453                                      device_ptr buffer_variance_ptr,
454                                      DenoisingTask *task);
455         bool denoising_get_feature(int mean_offset,
456                                    int variance_offset,
457                                    device_ptr mean_ptr,
458                                    device_ptr variance_ptr,
459                                    float scale,
460                                    DenoisingTask *task);
461         bool denoising_write_feature(int to_offset,
462                                      device_ptr from_ptr,
463                                      device_ptr buffer_ptr,
464                                      DenoisingTask *task);
465         bool denoising_detect_outliers(device_ptr image_ptr,
466                                        device_ptr variance_ptr,
467                                        device_ptr depth_ptr,
468                                        device_ptr output_ptr,
469                                        DenoisingTask *task);
470
471         device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int size);
472         void mem_free_sub_ptr(device_ptr ptr);
473
474         class ArgumentWrapper {
475         public:
476                 ArgumentWrapper() : size(0), pointer(NULL)
477                 {
478                 }
479
480                 ArgumentWrapper(device_memory& argument) : size(sizeof(void*)),
481                                                            pointer((void*)(&argument.device_pointer))
482                 {
483                 }
484
485                 template<typename T>
486                 ArgumentWrapper(device_vector<T>& argument) : size(sizeof(void*)),
487                                                               pointer((void*)(&argument.device_pointer))
488                 {
489                 }
490
491                 template<typename T>
492                 ArgumentWrapper(device_only_memory<T>& argument) : size(sizeof(void*)),
493                                                                    pointer((void*)(&argument.device_pointer))
494                 {
495                 }
496                 template<typename T>
497                 ArgumentWrapper(T& argument) : size(sizeof(argument)),
498                                                pointer(&argument)
499                 {
500                 }
501
502                 ArgumentWrapper(int argument) : size(sizeof(int)),
503                                                 int_value(argument),
504                                                 pointer(&int_value)
505                 {
506                 }
507
508                 ArgumentWrapper(float argument) : size(sizeof(float)),
509                                                   float_value(argument),
510                                                   pointer(&float_value)
511                 {
512                 }
513
514                 size_t size;
515                 int int_value;
516                 float float_value;
517                 void *pointer;
518         };
519
520         /* TODO(sergey): In the future we can use variadic templates, once
521          * C++0x is allowed. Should allow to clean this up a bit.
522          */
523         int kernel_set_args(cl_kernel kernel,
524                             int start_argument_index,
525                             const ArgumentWrapper& arg1 = ArgumentWrapper(),
526                             const ArgumentWrapper& arg2 = ArgumentWrapper(),
527                             const ArgumentWrapper& arg3 = ArgumentWrapper(),
528                             const ArgumentWrapper& arg4 = ArgumentWrapper(),
529                             const ArgumentWrapper& arg5 = ArgumentWrapper(),
530                             const ArgumentWrapper& arg6 = ArgumentWrapper(),
531                             const ArgumentWrapper& arg7 = ArgumentWrapper(),
532                             const ArgumentWrapper& arg8 = ArgumentWrapper(),
533                             const ArgumentWrapper& arg9 = ArgumentWrapper(),
534                             const ArgumentWrapper& arg10 = ArgumentWrapper(),
535                             const ArgumentWrapper& arg11 = ArgumentWrapper(),
536                             const ArgumentWrapper& arg12 = ArgumentWrapper(),
537                             const ArgumentWrapper& arg13 = ArgumentWrapper(),
538                             const ArgumentWrapper& arg14 = ArgumentWrapper(),
539                             const ArgumentWrapper& arg15 = ArgumentWrapper(),
540                             const ArgumentWrapper& arg16 = ArgumentWrapper(),
541                             const ArgumentWrapper& arg17 = ArgumentWrapper(),
542                             const ArgumentWrapper& arg18 = ArgumentWrapper(),
543                             const ArgumentWrapper& arg19 = ArgumentWrapper(),
544                             const ArgumentWrapper& arg20 = ArgumentWrapper(),
545                             const ArgumentWrapper& arg21 = ArgumentWrapper(),
546                             const ArgumentWrapper& arg22 = ArgumentWrapper(),
547                             const ArgumentWrapper& arg23 = ArgumentWrapper(),
548                             const ArgumentWrapper& arg24 = ArgumentWrapper(),
549                             const ArgumentWrapper& arg25 = ArgumentWrapper(),
550                             const ArgumentWrapper& arg26 = ArgumentWrapper(),
551                             const ArgumentWrapper& arg27 = ArgumentWrapper(),
552                             const ArgumentWrapper& arg28 = ArgumentWrapper(),
553                             const ArgumentWrapper& arg29 = ArgumentWrapper(),
554                             const ArgumentWrapper& arg30 = ArgumentWrapper(),
555                             const ArgumentWrapper& arg31 = ArgumentWrapper(),
556                             const ArgumentWrapper& arg32 = ArgumentWrapper(),
557                             const ArgumentWrapper& arg33 = ArgumentWrapper());
558
559         void release_kernel_safe(cl_kernel kernel);
560         void release_mem_object_safe(cl_mem mem);
561         void release_program_safe(cl_program program);
562
563         /* ** Those guys are for workign around some compiler-specific bugs ** */
564
565         virtual cl_program load_cached_kernel(
566                 ustring key,
567                 thread_scoped_lock& cache_locker);
568
569         virtual void store_cached_kernel(
570                 cl_program program,
571                 ustring key,
572                 thread_scoped_lock& cache_locker);
573
574         virtual string build_options_for_base_program(
575                 const DeviceRequestedFeatures& /*requested_features*/);
576
577 private:
578         MemoryManager memory_manager;
579         friend class MemoryManager;
580
581         static_assert_align(TextureInfo, 16);
582         device_vector<TextureInfo> texture_info;
583
584         typedef map<string, device_memory*> TexturesMap;
585         TexturesMap textures;
586
587         bool textures_need_update;
588
589 protected:
590         void flush_texture_buffers();
591 };
592
593 Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, Profiler &profiler, bool background);
594 Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, Profiler &profiler, bool background);
595
596 CCL_NAMESPACE_END
597
598 #endif