5a2555f9f8084ab62b5ea16210e27bd0d238dc6c
[blender.git] / intern / cycles / device / opencl / opencl_split.cpp
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifdef WITH_OPENCL
18
19 #include "device/opencl/opencl.h"
20
21 #include "render/buffers.h"
22
23 #include "kernel/kernel_types.h"
24 #include "kernel/split/kernel_split_data_types.h"
25
26 #include "device/device_split_kernel.h"
27
28 #include "util/util_algorithm.h"
29 #include "util/util_debug.h"
30 #include "util/util_logging.h"
31 #include "util/util_md5.h"
32 #include "util/util_path.h"
33 #include "util/util_time.h"
34
35 CCL_NAMESPACE_BEGIN
36
37 class OpenCLSplitKernel;
38
39 namespace {
40
41 /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
42  * fetch its size.
43  */
44 typedef struct KernelGlobalsDummy {
45         ccl_constant KernelData *data;
46         ccl_global char *buffers[8];
47
48 #define KERNEL_TEX(type, name) \
49         TextureInfo name;
50 #  include "kernel/kernel_textures.h"
51 #undef KERNEL_TEX
52         SplitData split_data;
53         SplitParams split_param_data;
54 } KernelGlobalsDummy;
55
56 }  // namespace
57
58 static string get_build_options(OpenCLDeviceBase *device, const DeviceRequestedFeatures& requested_features)
59 {
60         string build_options = "-D__SPLIT_KERNEL__ ";
61         build_options += requested_features.get_build_options();
62
63         /* Set compute device build option. */
64         cl_device_type device_type;
65         OpenCLInfo::get_device_type(device->cdDevice, &device_type, &device->ciErr);
66         assert(device->ciErr == CL_SUCCESS);
67         if(device_type == CL_DEVICE_TYPE_GPU) {
68                 build_options += " -D__COMPUTE_DEVICE_GPU__";
69         }
70
71         return build_options;
72 }
73
74 /* OpenCLDeviceSplitKernel's declaration/definition. */
75 class OpenCLDeviceSplitKernel : public OpenCLDeviceBase
76 {
77 public:
78         DeviceSplitKernel *split_kernel;
79         OpenCLProgram program_data_init;
80         OpenCLProgram program_state_buffer_size;
81
82         OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background_);
83
84         ~OpenCLDeviceSplitKernel()
85         {
86                 task_pool.stop();
87
88                 /* Release kernels */
89                 program_data_init.release();
90
91                 delete split_kernel;
92         }
93
94         virtual bool show_samples() const {
95                 return true;
96         }
97
98         virtual BVHLayoutMask get_bvh_layout_mask() const {
99                 return BVH_LAYOUT_BVH2;
100         }
101
102         virtual bool load_kernels(const DeviceRequestedFeatures& requested_features,
103                                   vector<OpenCLDeviceBase::OpenCLProgram*> &programs)
104         {
105                 bool single_program = OpenCLInfo::use_single_program();
106                 program_data_init = OpenCLDeviceBase::OpenCLProgram(this,
107                                                   single_program ? "split" : "split_data_init",
108                                                   single_program ? "kernel_split.cl" : "kernel_data_init.cl",
109                                                   get_build_options(this, requested_features));
110
111                 program_data_init.add_kernel(ustring("path_trace_data_init"));
112                 programs.push_back(&program_data_init);
113
114                 program_state_buffer_size = OpenCLDeviceBase::OpenCLProgram(this,
115                                                   single_program ? "split" : "split_state_buffer_size",
116                                                   single_program ? "kernel_split.cl" : "kernel_state_buffer_size.cl",
117                                                   get_build_options(this, requested_features));
118                 program_state_buffer_size.add_kernel(ustring("path_trace_state_buffer_size"));
119                 programs.push_back(&program_state_buffer_size);
120
121                 return split_kernel->load_kernels(requested_features);
122         }
123
124         void thread_run(DeviceTask *task)
125         {
126                 flush_texture_buffers();
127
128                 if(task->type == DeviceTask::FILM_CONVERT) {
129                         film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
130                 }
131                 else if(task->type == DeviceTask::SHADER) {
132                         shader(*task);
133                 }
134                 else if(task->type == DeviceTask::RENDER) {
135                         RenderTile tile;
136                         DenoisingTask denoising(this, *task);
137
138                         /* Allocate buffer for kernel globals */
139                         device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals");
140                         kgbuffer.alloc_to_device(1);
141
142                         /* Keep rendering tiles until done. */
143                         while(task->acquire_tile(this, tile)) {
144                                 if(tile.task == RenderTile::PATH_TRACE) {
145                                         assert(tile.task == RenderTile::PATH_TRACE);
146                                         scoped_timer timer(&tile.buffers->render_time);
147
148                                         split_kernel->path_trace(task,
149                                                                  tile,
150                                                                  kgbuffer,
151                                                                  *const_mem_map["__data"]);
152
153                                         /* Complete kernel execution before release tile. */
154                                         /* This helps in multi-device render;
155                                          * The device that reaches the critical-section function
156                                          * release_tile waits (stalling other devices from entering
157                                          * release_tile) for all kernels to complete. If device1 (a
158                                          * slow-render device) reaches release_tile first then it would
159                                          * stall device2 (a fast-render device) from proceeding to render
160                                          * next tile.
161                                          */
162                                         clFinish(cqCommandQueue);
163                                 }
164                                 else if(tile.task == RenderTile::DENOISE) {
165                                         tile.sample = tile.start_sample + tile.num_samples;
166                                         denoise(tile, denoising);
167                                         task->update_progress(&tile, tile.w*tile.h);
168                                 }
169
170                                 task->release_tile(tile);
171                         }
172
173                         kgbuffer.free();
174                 }
175         }
176
177         bool is_split_kernel()
178         {
179                 return true;
180         }
181
182 protected:
183         /* ** Those guys are for workign around some compiler-specific bugs ** */
184
185         string build_options_for_base_program(
186                 const DeviceRequestedFeatures& requested_features)
187         {
188                 return requested_features.get_build_options();
189         }
190
191         friend class OpenCLSplitKernel;
192         friend class OpenCLSplitKernelFunction;
193 };
194
195 struct CachedSplitMemory {
196         int id;
197         device_memory *split_data;
198         device_memory *ray_state;
199         device_memory *queue_index;
200         device_memory *use_queues_flag;
201         device_memory *work_pools;
202         device_ptr *buffer;
203 };
204
205 class OpenCLSplitKernelFunction : public SplitKernelFunction {
206 public:
207         OpenCLDeviceSplitKernel* device;
208         OpenCLDeviceBase::OpenCLProgram program;
209         CachedSplitMemory& cached_memory;
210         int cached_id;
211
212         OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device, CachedSplitMemory& cached_memory) :
213                         device(device), cached_memory(cached_memory), cached_id(cached_memory.id-1)
214         {
215         }
216
217         ~OpenCLSplitKernelFunction()
218         {
219                 program.release();
220         }
221
222         virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data)
223         {
224                 if(cached_id != cached_memory.id) {
225                         cl_uint start_arg_index =
226                                 device->kernel_set_args(program(),
227                                                     0,
228                                                     kg,
229                                                     data,
230                                                     *cached_memory.split_data,
231                                                     *cached_memory.ray_state);
232
233                                 device->set_kernel_arg_buffers(program(), &start_arg_index);
234
235                         start_arg_index +=
236                                 device->kernel_set_args(program(),
237                                                     start_arg_index,
238                                                     *cached_memory.queue_index,
239                                                     *cached_memory.use_queues_flag,
240                                                     *cached_memory.work_pools,
241                                                     *cached_memory.buffer);
242
243                         cached_id = cached_memory.id;
244                 }
245
246                 device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
247                                                        program(),
248                                                        2,
249                                                        NULL,
250                                                        dim.global_size,
251                                                        dim.local_size,
252                                                        0,
253                                                        NULL,
254                                                        NULL);
255
256                 device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
257
258                 if(device->ciErr != CL_SUCCESS) {
259                         string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
260                                                        clewErrorString(device->ciErr));
261                         device->opencl_error(message);
262                         return false;
263                 }
264
265                 return true;
266         }
267 };
268
269 class OpenCLSplitKernel : public DeviceSplitKernel {
270         OpenCLDeviceSplitKernel *device;
271         CachedSplitMemory cached_memory;
272 public:
273         explicit OpenCLSplitKernel(OpenCLDeviceSplitKernel *device) : DeviceSplitKernel(device), device(device) {
274         }
275
276         virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name,
277                                                                const DeviceRequestedFeatures& requested_features)
278         {
279                 OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device, cached_memory);
280
281                 bool single_program = OpenCLInfo::use_single_program();
282                 kernel->program =
283                         OpenCLDeviceBase::OpenCLProgram(device,
284                                                         single_program ? "split" : "split_" + kernel_name,
285                                                         single_program ? "kernel_split.cl" : "kernel_" + kernel_name + ".cl",
286                                                         get_build_options(device, requested_features));
287
288                 kernel->program.add_kernel(ustring("path_trace_" + kernel_name));
289                 kernel->program.load();
290
291                 if(!kernel->program.is_loaded()) {
292                         delete kernel;
293                         return NULL;
294                 }
295
296                 return kernel;
297         }
298
299         virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads)
300         {
301                 device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
302                 size_buffer.alloc(1);
303                 size_buffer.zero_to_device();
304
305                 uint threads = num_threads;
306                 device->kernel_set_args(device->program_state_buffer_size(), 0, kg, data, threads, size_buffer);
307
308                 size_t global_size = 64;
309                 device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
310                                                device->program_state_buffer_size(),
311                                                1,
312                                                NULL,
313                                                &global_size,
314                                                NULL,
315                                                0,
316                                                NULL,
317                                                NULL);
318
319                 device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
320
321                 size_buffer.copy_from_device(0, 1, 1);
322                 size_t size = size_buffer[0];
323                 size_buffer.free();
324
325                 if(device->ciErr != CL_SUCCESS) {
326                         string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
327                                                        clewErrorString(device->ciErr));
328                         device->opencl_error(message);
329                         return 0;
330                 }
331
332                 return size;
333         }
334
335         virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
336                                                     RenderTile& rtile,
337                                                     int num_global_elements,
338                                                     device_memory& kernel_globals,
339                                                     device_memory& kernel_data,
340                                                     device_memory& split_data,
341                                                     device_memory& ray_state,
342                                                     device_memory& queue_index,
343                                                     device_memory& use_queues_flag,
344                                                     device_memory& work_pool_wgs
345                                                     )
346         {
347                 cl_int dQueue_size = dim.global_size[0] * dim.global_size[1];
348
349                 /* Set the range of samples to be processed for every ray in
350                  * path-regeneration logic.
351                  */
352                 cl_int start_sample = rtile.start_sample;
353                 cl_int end_sample = rtile.start_sample + rtile.num_samples;
354
355                 cl_uint start_arg_index =
356                         device->kernel_set_args(device->program_data_init(),
357                                         0,
358                                         kernel_globals,
359                                         kernel_data,
360                                         split_data,
361                                         num_global_elements,
362                                         ray_state);
363
364                         device->set_kernel_arg_buffers(device->program_data_init(), &start_arg_index);
365
366                 start_arg_index +=
367                         device->kernel_set_args(device->program_data_init(),
368                                         start_arg_index,
369                                         start_sample,
370                                         end_sample,
371                                         rtile.x,
372                                         rtile.y,
373                                         rtile.w,
374                                         rtile.h,
375                                         rtile.offset,
376                                         rtile.stride,
377                                         queue_index,
378                                         dQueue_size,
379                                         use_queues_flag,
380                                         work_pool_wgs,
381                                         rtile.num_samples,
382                                         rtile.buffer);
383
384                 /* Enqueue ckPathTraceKernel_data_init kernel. */
385                 device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
386                                                device->program_data_init(),
387                                                2,
388                                                NULL,
389                                                dim.global_size,
390                                                dim.local_size,
391                                                0,
392                                                NULL,
393                                                NULL);
394
395                 device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
396
397                 if(device->ciErr != CL_SUCCESS) {
398                         string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
399                                                        clewErrorString(device->ciErr));
400                         device->opencl_error(message);
401                         return false;
402                 }
403
404                 cached_memory.split_data = &split_data;
405                 cached_memory.ray_state = &ray_state;
406                 cached_memory.queue_index = &queue_index;
407                 cached_memory.use_queues_flag = &use_queues_flag;
408                 cached_memory.work_pools = &work_pool_wgs;
409                 cached_memory.buffer = &rtile.buffer;
410                 cached_memory.id++;
411
412                 return true;
413         }
414
415         virtual int2 split_kernel_local_size()
416         {
417                 return make_int2(64, 1);
418         }
419
420         virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/)
421         {
422                 cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice);
423                 /* Use small global size on CPU devices as it seems to be much faster. */
424                 if(type == CL_DEVICE_TYPE_CPU) {
425                         VLOG(1) << "Global size: (64, 64).";
426                         return make_int2(64, 64);
427                 }
428
429                 cl_ulong max_buffer_size;
430                 clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
431
432                 if(DebugFlags().opencl.mem_limit) {
433                         max_buffer_size = min(max_buffer_size,
434                                               cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used));
435                 }
436
437                 VLOG(1) << "Maximum device allocation size: "
438                         << string_human_readable_number(max_buffer_size) << " bytes. ("
439                         << string_human_readable_size(max_buffer_size) << ").";
440
441                 /* Limit to 2gb, as we shouldn't need more than that and some devices may support much more. */
442                 max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l*1024*1024*1024);
443
444                 size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size);
445                 int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64), (int)sqrt(num_elements));
446                 VLOG(1) << "Global size: " << global_size << ".";
447                 return global_size;
448         }
449 };
450
451 OpenCLDeviceSplitKernel::OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background_)
452 : OpenCLDeviceBase(info, stats, profiler, background_)
453 {
454         split_kernel = new OpenCLSplitKernel(this);
455
456         background = background_;
457 }
458
459 Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, Profiler &profiler, bool background)
460 {
461         return new OpenCLDeviceSplitKernel(info, stats, profiler, background);
462 }
463
464 CCL_NAMESPACE_END
465
466 #endif  /* WITH_OPENCL */