Cycles: Support multithreaded compilation of kernels
[blender.git] / intern / cycles / device / opencl / opencl_split.cpp
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifdef WITH_OPENCL
18
19 #include "device/opencl/opencl.h"
20
21 #include "render/buffers.h"
22
23 #include "kernel/kernel_types.h"
24 #include "kernel/split/kernel_split_data_types.h"
25
26 #include "device/device_split_kernel.h"
27
28 #include "util/util_algorithm.h"
29 #include "util/util_debug.h"
30 #include "util/util_logging.h"
31 #include "util/util_md5.h"
32 #include "util/util_path.h"
33 #include "util/util_time.h"
34
35 CCL_NAMESPACE_BEGIN
36
37 class OpenCLSplitKernel;
38
39 namespace {
40
41 /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
42  * fetch its size.
43  */
44 typedef struct KernelGlobalsDummy {
45         ccl_constant KernelData *data;
46         ccl_global char *buffers[8];
47
48 #define KERNEL_TEX(type, name) \
49         TextureInfo name;
50 #  include "kernel/kernel_textures.h"
51 #undef KERNEL_TEX
52         SplitData split_data;
53         SplitParams split_param_data;
54 } KernelGlobalsDummy;
55
56 }  // namespace
57
58 static string get_build_options(OpenCLDeviceBase *device, const DeviceRequestedFeatures& requested_features)
59 {
60         string build_options = "-D__SPLIT_KERNEL__ ";
61         build_options += requested_features.get_build_options();
62
63         /* Set compute device build option. */
64         cl_device_type device_type;
65         OpenCLInfo::get_device_type(device->cdDevice, &device_type, &device->ciErr);
66         assert(device->ciErr == CL_SUCCESS);
67         if(device_type == CL_DEVICE_TYPE_GPU) {
68                 build_options += " -D__COMPUTE_DEVICE_GPU__";
69         }
70
71         return build_options;
72 }
73
74 /* OpenCLDeviceSplitKernel's declaration/definition. */
75 class OpenCLDeviceSplitKernel : public OpenCLDeviceBase
76 {
77 public:
78         DeviceSplitKernel *split_kernel;
79         OpenCLProgram program_data_init;
80         OpenCLProgram program_state_buffer_size;
81
82         OpenCLProgram program_split;
83
84         OpenCLProgram program_path_init;
85         OpenCLProgram program_scene_intersect;
86         OpenCLProgram program_lamp_emission;
87         OpenCLProgram program_do_volume;
88         OpenCLProgram program_queue_enqueue;
89         OpenCLProgram program_indirect_background;
90         OpenCLProgram program_shader_setup;
91         OpenCLProgram program_shader_sort;
92         OpenCLProgram program_shader_eval;
93         OpenCLProgram program_holdout_emission_blurring_pathtermination_ao;
94         OpenCLProgram program_subsurface_scatter;
95         OpenCLProgram program_direct_lighting;
96         OpenCLProgram program_shadow_blocked_ao;
97         OpenCLProgram program_shadow_blocked_dl;
98         OpenCLProgram program_enqueue_inactive;
99         OpenCLProgram program_next_iteration_setup;
100         OpenCLProgram program_indirect_subsurface;
101         OpenCLProgram program_buffer_update;
102
103         OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background_);
104
105         ~OpenCLDeviceSplitKernel()
106         {
107                 task_pool.stop();
108
109                 /* Release kernels */
110                 program_data_init.release();
111
112                 delete split_kernel;
113         }
114
115         virtual bool show_samples() const {
116                 return true;
117         }
118
119         virtual BVHLayoutMask get_bvh_layout_mask() const {
120                 return BVH_LAYOUT_BVH2;
121         }
122
123         virtual bool load_kernels(const DeviceRequestedFeatures& requested_features)
124         {
125                 if (!OpenCLDeviceBase::load_kernels(requested_features)) {
126                         return false;
127                 }
128                 return split_kernel->load_kernels(requested_features);
129         }
130
131         const string fast_compiled_kernels =
132                 "path_init "
133                 "scene_intersect "
134                 "queue_enqueue "
135                 "shader_setup "
136                 "shader_sort "
137                 "enqueue_inactive "
138                 "next_iteration_setup "
139                 "indirect_subsurface "
140                 "buffer_update";
141
142         const string get_opencl_program_name(bool single_program, const string& kernel_name)
143         {
144                 if (single_program) {
145                         return "split";
146                 }
147                 else {
148                         if (fast_compiled_kernels.find(kernel_name) != std::string::npos) {
149                                 return "split_bundle";
150                         }
151                         else {
152                                 return "split_" + kernel_name;
153                         }
154                 }
155         }
156
157         const string get_opencl_program_filename(bool single_program, const string& kernel_name)
158         {
159                 if (single_program) {
160                         return "kernel_split.cl";
161                 }
162                 else {
163                         if (fast_compiled_kernels.find(kernel_name) != std::string::npos) {
164                                 return "kernel_split_bundle.cl";
165                         }
166                         else {
167                                 return "kernel_" + kernel_name + ".cl";
168                         }
169                 }
170         }
171
172         virtual bool add_kernel_programs(const DeviceRequestedFeatures& requested_features,
173                                   vector<OpenCLDeviceBase::OpenCLProgram*> &programs)
174         {
175                 bool single_program = OpenCLInfo::use_single_program();
176                 program_data_init = OpenCLDeviceBase::OpenCLProgram(
177                         this,
178                         get_opencl_program_name(single_program, "data_init"),
179                         get_opencl_program_filename(single_program, "data_init"),
180                         get_build_options(this, requested_features));
181                 program_data_init.add_kernel(ustring("path_trace_data_init"));
182                 programs.push_back(&program_data_init);
183
184                 program_state_buffer_size = OpenCLDeviceBase::OpenCLProgram(
185                         this,
186                         get_opencl_program_name(single_program, "state_buffer_size"),
187                         get_opencl_program_filename(single_program, "state_buffer_size"),
188                         get_build_options(this, requested_features));
189
190                 program_state_buffer_size.add_kernel(ustring("path_trace_state_buffer_size"));
191                 programs.push_back(&program_state_buffer_size);
192
193
194 #define ADD_SPLIT_KERNEL_SINGLE_PROGRAM(kernel_name) program_split.add_kernel(ustring("path_trace_"#kernel_name));
195 #define ADD_SPLIT_KERNEL_SPLIT_PROGRAM(kernel_name) \
196                         program_##kernel_name = \
197                                 OpenCLDeviceBase::OpenCLProgram(this, \
198                                                                                                 "split_"#kernel_name, \
199                                                                                                 "kernel_"#kernel_name".cl", \
200                                                                                                 get_build_options(this, requested_features)); \
201                         program_##kernel_name.add_kernel(ustring("path_trace_"#kernel_name)); \
202                         programs.push_back(&program_##kernel_name);
203
204                 if (single_program) {
205                         program_split = OpenCLDeviceBase::OpenCLProgram(
206                                 this,
207                                 "split" ,
208                                 "kernel_split.cl",
209                                 get_build_options(this, requested_features));
210
211                         ADD_SPLIT_KERNEL_SINGLE_PROGRAM(path_init);
212                         ADD_SPLIT_KERNEL_SINGLE_PROGRAM(scene_intersect);
213                         ADD_SPLIT_KERNEL_SINGLE_PROGRAM(lamp_emission);
214                         ADD_SPLIT_KERNEL_SINGLE_PROGRAM(do_volume);
215                         ADD_SPLIT_KERNEL_SINGLE_PROGRAM(queue_enqueue);
216                         ADD_SPLIT_KERNEL_SINGLE_PROGRAM(indirect_background);
217                         ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_setup);
218                         ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_sort);
219                         ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_eval);
220                         ADD_SPLIT_KERNEL_SINGLE_PROGRAM(holdout_emission_blurring_pathtermination_ao);
221                         ADD_SPLIT_KERNEL_SINGLE_PROGRAM(subsurface_scatter);
222                         ADD_SPLIT_KERNEL_SINGLE_PROGRAM(direct_lighting);
223                         ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shadow_blocked_ao);
224                         ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shadow_blocked_dl);
225                         ADD_SPLIT_KERNEL_SINGLE_PROGRAM(enqueue_inactive);
226                         ADD_SPLIT_KERNEL_SINGLE_PROGRAM(next_iteration_setup);
227                         ADD_SPLIT_KERNEL_SINGLE_PROGRAM(indirect_subsurface);
228                         ADD_SPLIT_KERNEL_SINGLE_PROGRAM(buffer_update);
229
230                         programs.push_back(&program_split);
231                 }
232                 else {
233                         /* Ordered with most complex kernels first, to reduce overall compile time. */
234                         ADD_SPLIT_KERNEL_SPLIT_PROGRAM(subsurface_scatter);
235                         ADD_SPLIT_KERNEL_SPLIT_PROGRAM(do_volume);
236                         ADD_SPLIT_KERNEL_SPLIT_PROGRAM(shadow_blocked_dl);
237                         ADD_SPLIT_KERNEL_SPLIT_PROGRAM(shadow_blocked_ao);
238                         ADD_SPLIT_KERNEL_SPLIT_PROGRAM(holdout_emission_blurring_pathtermination_ao);
239                         ADD_SPLIT_KERNEL_SPLIT_PROGRAM(lamp_emission);
240                         ADD_SPLIT_KERNEL_SPLIT_PROGRAM(direct_lighting);
241                         ADD_SPLIT_KERNEL_SPLIT_PROGRAM(indirect_background);
242                         ADD_SPLIT_KERNEL_SPLIT_PROGRAM(shader_eval);
243
244                         /* Quick kernels bundled in a single program to reduce overhead of starting
245                          * Blender processes. */
246                         program_split = OpenCLDeviceBase::OpenCLProgram(
247                                 this,
248                                 "split_bundle" ,
249                                 "kernel_split_bundle.cl",
250                                 get_build_options(this, requested_features));
251
252                         ADD_SPLIT_KERNEL_SINGLE_PROGRAM(path_init);
253                         ADD_SPLIT_KERNEL_SINGLE_PROGRAM(scene_intersect);
254                         ADD_SPLIT_KERNEL_SINGLE_PROGRAM(queue_enqueue);
255                         ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_setup);
256                         ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_sort);
257                         ADD_SPLIT_KERNEL_SINGLE_PROGRAM(enqueue_inactive);
258                         ADD_SPLIT_KERNEL_SINGLE_PROGRAM(next_iteration_setup);
259                         ADD_SPLIT_KERNEL_SINGLE_PROGRAM(indirect_subsurface);
260                         ADD_SPLIT_KERNEL_SINGLE_PROGRAM(buffer_update);
261                         programs.push_back(&program_split);
262                 }
263 #undef ADD_SPLIT_KERNEL_SPLIT_PROGRAM
264 #undef ADD_SPLIT_KERNEL_SINGLE_PROGRAM
265
266                 return true;
267         }
268
269         void thread_run(DeviceTask *task)
270         {
271                 flush_texture_buffers();
272
273                 if(task->type == DeviceTask::FILM_CONVERT) {
274                         film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
275                 }
276                 else if(task->type == DeviceTask::SHADER) {
277                         shader(*task);
278                 }
279                 else if(task->type == DeviceTask::RENDER) {
280                         RenderTile tile;
281                         DenoisingTask denoising(this, *task);
282
283                         /* Allocate buffer for kernel globals */
284                         device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals");
285                         kgbuffer.alloc_to_device(1);
286
287                         /* Keep rendering tiles until done. */
288                         while(task->acquire_tile(this, tile)) {
289                                 if(tile.task == RenderTile::PATH_TRACE) {
290                                         assert(tile.task == RenderTile::PATH_TRACE);
291                                         scoped_timer timer(&tile.buffers->render_time);
292
293                                         split_kernel->path_trace(task,
294                                                                  tile,
295                                                                  kgbuffer,
296                                                                  *const_mem_map["__data"]);
297
298                                         /* Complete kernel execution before release tile. */
299                                         /* This helps in multi-device render;
300                                          * The device that reaches the critical-section function
301                                          * release_tile waits (stalling other devices from entering
302                                          * release_tile) for all kernels to complete. If device1 (a
303                                          * slow-render device) reaches release_tile first then it would
304                                          * stall device2 (a fast-render device) from proceeding to render
305                                          * next tile.
306                                          */
307                                         clFinish(cqCommandQueue);
308                                 }
309                                 else if(tile.task == RenderTile::DENOISE) {
310                                         tile.sample = tile.start_sample + tile.num_samples;
311                                         denoise(tile, denoising);
312                                         task->update_progress(&tile, tile.w*tile.h);
313                                 }
314
315                                 task->release_tile(tile);
316                         }
317
318                         kgbuffer.free();
319                 }
320         }
321
322         bool is_split_kernel()
323         {
324                 return true;
325         }
326
327 protected:
328         /* ** Those guys are for workign around some compiler-specific bugs ** */
329
330         string build_options_for_base_program(
331                 const DeviceRequestedFeatures& requested_features)
332         {
333                 return requested_features.get_build_options();
334         }
335
336         friend class OpenCLSplitKernel;
337         friend class OpenCLSplitKernelFunction;
338 };
339
340 struct CachedSplitMemory {
341         int id;
342         device_memory *split_data;
343         device_memory *ray_state;
344         device_memory *queue_index;
345         device_memory *use_queues_flag;
346         device_memory *work_pools;
347         device_ptr *buffer;
348 };
349
350 class OpenCLSplitKernelFunction : public SplitKernelFunction {
351 public:
352         OpenCLDeviceSplitKernel* device;
353         OpenCLDeviceBase::OpenCLProgram program;
354         CachedSplitMemory& cached_memory;
355         int cached_id;
356
357         OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device, CachedSplitMemory& cached_memory) :
358                         device(device), cached_memory(cached_memory), cached_id(cached_memory.id-1)
359         {
360         }
361
362         ~OpenCLSplitKernelFunction()
363         {
364                 program.release();
365         }
366
367         virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data)
368         {
369                 if(cached_id != cached_memory.id) {
370                         cl_uint start_arg_index =
371                                 device->kernel_set_args(program(),
372                                                     0,
373                                                     kg,
374                                                     data,
375                                                     *cached_memory.split_data,
376                                                     *cached_memory.ray_state);
377
378                                 device->set_kernel_arg_buffers(program(), &start_arg_index);
379
380                         start_arg_index +=
381                                 device->kernel_set_args(program(),
382                                                     start_arg_index,
383                                                     *cached_memory.queue_index,
384                                                     *cached_memory.use_queues_flag,
385                                                     *cached_memory.work_pools,
386                                                     *cached_memory.buffer);
387
388                         cached_id = cached_memory.id;
389                 }
390
391                 device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
392                                                        program(),
393                                                        2,
394                                                        NULL,
395                                                        dim.global_size,
396                                                        dim.local_size,
397                                                        0,
398                                                        NULL,
399                                                        NULL);
400
401                 device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
402
403                 if(device->ciErr != CL_SUCCESS) {
404                         string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
405                                                        clewErrorString(device->ciErr));
406                         device->opencl_error(message);
407                         return false;
408                 }
409
410                 return true;
411         }
412 };
413
414 class OpenCLSplitKernel : public DeviceSplitKernel {
415         OpenCLDeviceSplitKernel *device;
416         CachedSplitMemory cached_memory;
417 public:
418         explicit OpenCLSplitKernel(OpenCLDeviceSplitKernel *device) : DeviceSplitKernel(device), device(device) {
419         }
420
421         virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name,
422                                                                const DeviceRequestedFeatures& requested_features)
423         {
424                 OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device, cached_memory);
425
426                 bool single_program = OpenCLInfo::use_single_program();
427                 kernel->program =
428                         OpenCLDeviceBase::OpenCLProgram(device,
429                                                         device->get_opencl_program_name(single_program, kernel_name),
430                                                         device->get_opencl_program_filename(single_program, kernel_name),
431                                                         get_build_options(device, requested_features));
432
433                 kernel->program.add_kernel(ustring("path_trace_" + kernel_name));
434                 kernel->program.load();
435
436                 if(!kernel->program.is_loaded()) {
437                         delete kernel;
438                         return NULL;
439                 }
440
441                 return kernel;
442         }
443
444         virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads)
445         {
446                 device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
447                 size_buffer.alloc(1);
448                 size_buffer.zero_to_device();
449
450                 uint threads = num_threads;
451                 device->kernel_set_args(device->program_state_buffer_size(), 0, kg, data, threads, size_buffer);
452
453                 size_t global_size = 64;
454                 device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
455                                                device->program_state_buffer_size(),
456                                                1,
457                                                NULL,
458                                                &global_size,
459                                                NULL,
460                                                0,
461                                                NULL,
462                                                NULL);
463
464                 device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
465
466                 size_buffer.copy_from_device(0, 1, 1);
467                 size_t size = size_buffer[0];
468                 size_buffer.free();
469
470                 if(device->ciErr != CL_SUCCESS) {
471                         string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
472                                                        clewErrorString(device->ciErr));
473                         device->opencl_error(message);
474                         return 0;
475                 }
476
477                 return size;
478         }
479
480         virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
481                                                     RenderTile& rtile,
482                                                     int num_global_elements,
483                                                     device_memory& kernel_globals,
484                                                     device_memory& kernel_data,
485                                                     device_memory& split_data,
486                                                     device_memory& ray_state,
487                                                     device_memory& queue_index,
488                                                     device_memory& use_queues_flag,
489                                                     device_memory& work_pool_wgs
490                                                     )
491         {
492                 cl_int dQueue_size = dim.global_size[0] * dim.global_size[1];
493
494                 /* Set the range of samples to be processed for every ray in
495                  * path-regeneration logic.
496                  */
497                 cl_int start_sample = rtile.start_sample;
498                 cl_int end_sample = rtile.start_sample + rtile.num_samples;
499
500                 cl_uint start_arg_index =
501                         device->kernel_set_args(device->program_data_init(),
502                                         0,
503                                         kernel_globals,
504                                         kernel_data,
505                                         split_data,
506                                         num_global_elements,
507                                         ray_state);
508
509                         device->set_kernel_arg_buffers(device->program_data_init(), &start_arg_index);
510
511                 start_arg_index +=
512                         device->kernel_set_args(device->program_data_init(),
513                                         start_arg_index,
514                                         start_sample,
515                                         end_sample,
516                                         rtile.x,
517                                         rtile.y,
518                                         rtile.w,
519                                         rtile.h,
520                                         rtile.offset,
521                                         rtile.stride,
522                                         queue_index,
523                                         dQueue_size,
524                                         use_queues_flag,
525                                         work_pool_wgs,
526                                         rtile.num_samples,
527                                         rtile.buffer);
528
529                 /* Enqueue ckPathTraceKernel_data_init kernel. */
530                 device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
531                                                device->program_data_init(),
532                                                2,
533                                                NULL,
534                                                dim.global_size,
535                                                dim.local_size,
536                                                0,
537                                                NULL,
538                                                NULL);
539
540                 device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
541
542                 if(device->ciErr != CL_SUCCESS) {
543                         string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
544                                                        clewErrorString(device->ciErr));
545                         device->opencl_error(message);
546                         return false;
547                 }
548
549                 cached_memory.split_data = &split_data;
550                 cached_memory.ray_state = &ray_state;
551                 cached_memory.queue_index = &queue_index;
552                 cached_memory.use_queues_flag = &use_queues_flag;
553                 cached_memory.work_pools = &work_pool_wgs;
554                 cached_memory.buffer = &rtile.buffer;
555                 cached_memory.id++;
556
557                 return true;
558         }
559
560         virtual int2 split_kernel_local_size()
561         {
562                 return make_int2(64, 1);
563         }
564
565         virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/)
566         {
567                 cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice);
568                 /* Use small global size on CPU devices as it seems to be much faster. */
569                 if(type == CL_DEVICE_TYPE_CPU) {
570                         VLOG(1) << "Global size: (64, 64).";
571                         return make_int2(64, 64);
572                 }
573
574                 cl_ulong max_buffer_size;
575                 clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
576
577                 if(DebugFlags().opencl.mem_limit) {
578                         max_buffer_size = min(max_buffer_size,
579                                               cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used));
580                 }
581
582                 VLOG(1) << "Maximum device allocation size: "
583                         << string_human_readable_number(max_buffer_size) << " bytes. ("
584                         << string_human_readable_size(max_buffer_size) << ").";
585
586                 /* Limit to 2gb, as we shouldn't need more than that and some devices may support much more. */
587                 max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l*1024*1024*1024);
588
589                 size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size);
590                 int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64), (int)sqrt(num_elements));
591                 VLOG(1) << "Global size: " << global_size << ".";
592                 return global_size;
593         }
594 };
595
596 OpenCLDeviceSplitKernel::OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background_)
597 : OpenCLDeviceBase(info, stats, profiler, background_)
598 {
599         split_kernel = new OpenCLSplitKernel(this);
600
601         background = background_;
602 }
603
604 Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, Profiler &profiler, bool background)
605 {
606         return new OpenCLDeviceSplitKernel(info, stats, profiler, background);
607 }
608
609 CCL_NAMESPACE_END
610
611 #endif  /* WITH_OPENCL */