2d819080674c8d08306bbef0964b2eae6f5b4ee1
[blender-staging.git] / intern / cycles / device / opencl / opencl_split.cpp
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifdef WITH_OPENCL
18
19 #include "device/opencl/opencl.h"
20
21 #include "render/buffers.h"
22
23 #include "kernel/kernel_types.h"
24 #include "kernel/split/kernel_split_data_types.h"
25
26 #include "device/device_split_kernel.h"
27
28 #include "util/util_algorithm.h"
29 #include "util/util_logging.h"
30 #include "util/util_md5.h"
31 #include "util/util_path.h"
32 #include "util/util_time.h"
33
34 CCL_NAMESPACE_BEGIN
35
36 class OpenCLSplitKernel;
37
38 namespace {
39
40 /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
41  * fetch its size.
42  */
43 typedef struct KernelGlobalsDummy {
44         ccl_constant KernelData *data;
45         ccl_global char *buffers[8];
46
47 #define KERNEL_TEX(type, name) \
48         TextureInfo name;
49 #  include "kernel/kernel_textures.h"
50 #undef KERNEL_TEX
51         SplitData split_data;
52         SplitParams split_param_data;
53 } KernelGlobalsDummy;
54
55 }  // namespace
56
57 static string get_build_options(OpenCLDeviceBase *device, const DeviceRequestedFeatures& requested_features)
58 {
59         string build_options = "-D__SPLIT_KERNEL__ ";
60         build_options += requested_features.get_build_options();
61
62         /* Set compute device build option. */
63         cl_device_type device_type;
64         OpenCLInfo::get_device_type(device->cdDevice, &device_type, &device->ciErr);
65         assert(device->ciErr == CL_SUCCESS);
66         if(device_type == CL_DEVICE_TYPE_GPU) {
67                 build_options += " -D__COMPUTE_DEVICE_GPU__";
68         }
69
70         return build_options;
71 }
72
73 /* OpenCLDeviceSplitKernel's declaration/definition. */
74 class OpenCLDeviceSplitKernel : public OpenCLDeviceBase
75 {
76 public:
77         DeviceSplitKernel *split_kernel;
78         OpenCLProgram program_data_init;
79         OpenCLProgram program_state_buffer_size;
80
81         OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_);
82
83         ~OpenCLDeviceSplitKernel()
84         {
85                 task_pool.stop();
86
87                 /* Release kernels */
88                 program_data_init.release();
89
90                 delete split_kernel;
91         }
92
93         virtual bool show_samples() const {
94                 return true;
95         }
96
97         virtual bool load_kernels(const DeviceRequestedFeatures& requested_features,
98                                   vector<OpenCLDeviceBase::OpenCLProgram*> &programs)
99         {
100                 bool single_program = OpenCLInfo::use_single_program();
101                 program_data_init = OpenCLDeviceBase::OpenCLProgram(this,
102                                                   single_program ? "split" : "split_data_init",
103                                                   single_program ? "kernel_split.cl" : "kernel_data_init.cl",
104                                                   get_build_options(this, requested_features));
105
106                 program_data_init.add_kernel(ustring("path_trace_data_init"));
107                 programs.push_back(&program_data_init);
108
109                 program_state_buffer_size = OpenCLDeviceBase::OpenCLProgram(this,
110                                                   single_program ? "split" : "split_state_buffer_size",
111                                                   single_program ? "kernel_split.cl" : "kernel_state_buffer_size.cl",
112                                                   get_build_options(this, requested_features));
113                 program_state_buffer_size.add_kernel(ustring("path_trace_state_buffer_size"));
114                 programs.push_back(&program_state_buffer_size);
115
116                 return split_kernel->load_kernels(requested_features);
117         }
118
119         void thread_run(DeviceTask *task)
120         {
121                 flush_texture_buffers();
122
123                 if(task->type == DeviceTask::FILM_CONVERT) {
124                         film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
125                 }
126                 else if(task->type == DeviceTask::SHADER) {
127                         shader(*task);
128                 }
129                 else if(task->type == DeviceTask::RENDER) {
130                         RenderTile tile;
131                         DenoisingTask denoising(this);
132
133                         /* Allocate buffer for kernel globals */
134                         device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals");
135                         kgbuffer.alloc_to_device(1);
136
137                         /* Keep rendering tiles until done. */
138                         while(task->acquire_tile(this, tile)) {
139                                 if(tile.task == RenderTile::PATH_TRACE) {
140                                         assert(tile.task == RenderTile::PATH_TRACE);
141                                         scoped_timer timer(&tile.buffers->render_time);
142
143                                         split_kernel->path_trace(task,
144                                                                  tile,
145                                                                  kgbuffer,
146                                                                  *const_mem_map["__data"]);
147
148                                         /* Complete kernel execution before release tile. */
149                                         /* This helps in multi-device render;
150                                          * The device that reaches the critical-section function
151                                          * release_tile waits (stalling other devices from entering
152                                          * release_tile) for all kernels to complete. If device1 (a
153                                          * slow-render device) reaches release_tile first then it would
154                                          * stall device2 (a fast-render device) from proceeding to render
155                                          * next tile.
156                                          */
157                                         clFinish(cqCommandQueue);
158                                 }
159                                 else if(tile.task == RenderTile::DENOISE) {
160                                         tile.sample = tile.start_sample + tile.num_samples;
161                                         denoise(tile, denoising, *task);
162                                         task->update_progress(&tile, tile.w*tile.h);
163                                 }
164
165                                 task->release_tile(tile);
166                         }
167
168                         kgbuffer.free();
169                 }
170         }
171
172         bool is_split_kernel()
173         {
174                 return true;
175         }
176
177 protected:
178         /* ** Those guys are for workign around some compiler-specific bugs ** */
179
180         string build_options_for_base_program(
181                 const DeviceRequestedFeatures& requested_features)
182         {
183                 return requested_features.get_build_options();
184         }
185
186         friend class OpenCLSplitKernel;
187         friend class OpenCLSplitKernelFunction;
188 };
189
190 struct CachedSplitMemory {
191         int id;
192         device_memory *split_data;
193         device_memory *ray_state;
194         device_memory *queue_index;
195         device_memory *use_queues_flag;
196         device_memory *work_pools;
197         device_ptr *buffer;
198 };
199
200 class OpenCLSplitKernelFunction : public SplitKernelFunction {
201 public:
202         OpenCLDeviceSplitKernel* device;
203         OpenCLDeviceBase::OpenCLProgram program;
204         CachedSplitMemory& cached_memory;
205         int cached_id;
206
207         OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device, CachedSplitMemory& cached_memory) :
208                         device(device), cached_memory(cached_memory), cached_id(cached_memory.id-1)
209         {
210         }
211
212         ~OpenCLSplitKernelFunction()
213         {
214                 program.release();
215         }
216
217         virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data)
218         {
219                 if(cached_id != cached_memory.id) {
220                         cl_uint start_arg_index =
221                                 device->kernel_set_args(program(),
222                                                     0,
223                                                     kg,
224                                                     data,
225                                                     *cached_memory.split_data,
226                                                     *cached_memory.ray_state);
227
228                                 device->set_kernel_arg_buffers(program(), &start_arg_index);
229
230                         start_arg_index +=
231                                 device->kernel_set_args(program(),
232                                                     start_arg_index,
233                                                     *cached_memory.queue_index,
234                                                     *cached_memory.use_queues_flag,
235                                                     *cached_memory.work_pools,
236                                                     *cached_memory.buffer);
237
238                         cached_id = cached_memory.id;
239                 }
240
241                 device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
242                                                        program(),
243                                                        2,
244                                                        NULL,
245                                                        dim.global_size,
246                                                        dim.local_size,
247                                                        0,
248                                                        NULL,
249                                                        NULL);
250
251                 device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
252
253                 if(device->ciErr != CL_SUCCESS) {
254                         string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
255                                                        clewErrorString(device->ciErr));
256                         device->opencl_error(message);
257                         return false;
258                 }
259
260                 return true;
261         }
262 };
263
264 class OpenCLSplitKernel : public DeviceSplitKernel {
265         OpenCLDeviceSplitKernel *device;
266         CachedSplitMemory cached_memory;
267 public:
268         explicit OpenCLSplitKernel(OpenCLDeviceSplitKernel *device) : DeviceSplitKernel(device), device(device) {
269         }
270
271         virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name,
272                                                                const DeviceRequestedFeatures& requested_features)
273         {
274                 OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device, cached_memory);
275
276                 bool single_program = OpenCLInfo::use_single_program();
277                 kernel->program =
278                         OpenCLDeviceBase::OpenCLProgram(device,
279                                                         single_program ? "split" : "split_" + kernel_name,
280                                                         single_program ? "kernel_split.cl" : "kernel_" + kernel_name + ".cl",
281                                                         get_build_options(device, requested_features));
282
283                 kernel->program.add_kernel(ustring("path_trace_" + kernel_name));
284                 kernel->program.load();
285
286                 if(!kernel->program.is_loaded()) {
287                         delete kernel;
288                         return NULL;
289                 }
290
291                 return kernel;
292         }
293
294         virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads)
295         {
296                 device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
297                 size_buffer.alloc(1);
298                 size_buffer.zero_to_device();
299
300                 uint threads = num_threads;
301                 device->kernel_set_args(device->program_state_buffer_size(), 0, kg, data, threads, size_buffer);
302
303                 size_t global_size = 64;
304                 device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
305                                                device->program_state_buffer_size(),
306                                                1,
307                                                NULL,
308                                                &global_size,
309                                                NULL,
310                                                0,
311                                                NULL,
312                                                NULL);
313
314                 device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
315
316                 size_buffer.copy_from_device(0, 1, 1);
317                 size_t size = size_buffer[0];
318                 size_buffer.free();
319
320                 if(device->ciErr != CL_SUCCESS) {
321                         string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
322                                                        clewErrorString(device->ciErr));
323                         device->opencl_error(message);
324                         return 0;
325                 }
326
327                 return size;
328         }
329
330         virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
331                                                     RenderTile& rtile,
332                                                     int num_global_elements,
333                                                     device_memory& kernel_globals,
334                                                     device_memory& kernel_data,
335                                                     device_memory& split_data,
336                                                     device_memory& ray_state,
337                                                     device_memory& queue_index,
338                                                     device_memory& use_queues_flag,
339                                                     device_memory& work_pool_wgs
340                                                     )
341         {
342                 cl_int dQueue_size = dim.global_size[0] * dim.global_size[1];
343
344                 /* Set the range of samples to be processed for every ray in
345                  * path-regeneration logic.
346                  */
347                 cl_int start_sample = rtile.start_sample;
348                 cl_int end_sample = rtile.start_sample + rtile.num_samples;
349
350                 cl_uint start_arg_index =
351                         device->kernel_set_args(device->program_data_init(),
352                                         0,
353                                         kernel_globals,
354                                         kernel_data,
355                                         split_data,
356                                         num_global_elements,
357                                         ray_state);
358
359                         device->set_kernel_arg_buffers(device->program_data_init(), &start_arg_index);
360
361                 start_arg_index +=
362                         device->kernel_set_args(device->program_data_init(),
363                                         start_arg_index,
364                                         start_sample,
365                                         end_sample,
366                                         rtile.x,
367                                         rtile.y,
368                                         rtile.w,
369                                         rtile.h,
370                                         rtile.offset,
371                                         rtile.stride,
372                                         queue_index,
373                                         dQueue_size,
374                                         use_queues_flag,
375                                         work_pool_wgs,
376                                         rtile.num_samples,
377                                         rtile.buffer);
378
379                 /* Enqueue ckPathTraceKernel_data_init kernel. */
380                 device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
381                                                device->program_data_init(),
382                                                2,
383                                                NULL,
384                                                dim.global_size,
385                                                dim.local_size,
386                                                0,
387                                                NULL,
388                                                NULL);
389
390                 device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
391
392                 if(device->ciErr != CL_SUCCESS) {
393                         string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
394                                                        clewErrorString(device->ciErr));
395                         device->opencl_error(message);
396                         return false;
397                 }
398
399                 cached_memory.split_data = &split_data;
400                 cached_memory.ray_state = &ray_state;
401                 cached_memory.queue_index = &queue_index;
402                 cached_memory.use_queues_flag = &use_queues_flag;
403                 cached_memory.work_pools = &work_pool_wgs;
404                 cached_memory.buffer = &rtile.buffer;
405                 cached_memory.id++;
406
407                 return true;
408         }
409
410         virtual int2 split_kernel_local_size()
411         {
412                 return make_int2(64, 1);
413         }
414
415         virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/)
416         {
417                 cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice);
418                 /* Use small global size on CPU devices as it seems to be much faster. */
419                 if(type == CL_DEVICE_TYPE_CPU) {
420                         VLOG(1) << "Global size: (64, 64).";
421                         return make_int2(64, 64);
422                 }
423
424                 cl_ulong max_buffer_size;
425                 clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
426
427                 if(DebugFlags().opencl.mem_limit) {
428                         max_buffer_size = min(max_buffer_size,
429                                               cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used));
430                 }
431
432                 VLOG(1) << "Maximum device allocation size: "
433                         << string_human_readable_number(max_buffer_size) << " bytes. ("
434                         << string_human_readable_size(max_buffer_size) << ").";
435
436                 /* Limit to 2gb, as we shouldn't need more than that and some devices may support much more. */
437                 max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l*1024*1024*1024);
438
439                 size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size);
440                 int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64), (int)sqrt(num_elements));
441                 VLOG(1) << "Global size: " << global_size << ".";
442                 return global_size;
443         }
444 };
445
446 OpenCLDeviceSplitKernel::OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_)
447 : OpenCLDeviceBase(info, stats, background_)
448 {
449         split_kernel = new OpenCLSplitKernel(this);
450
451         background = background_;
452 }
453
454 Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, bool background)
455 {
456         return new OpenCLDeviceSplitKernel(info, stats, background);
457 }
458
459 CCL_NAMESPACE_END
460
461 #endif /* WITH_OPENCL */