c966ebe0c5ea403a21bd1b22ba15bd49abaa8d26
[blender-staging.git] / intern / cycles / device / opencl / opencl_split.cpp
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifdef WITH_OPENCL
18
19 #include "device/opencl/opencl.h"
20
21 #include "render/buffers.h"
22
23 #include "kernel/kernel_types.h"
24 #include "kernel/split/kernel_split_data_types.h"
25
26 #include "device/device_split_kernel.h"
27
28 #include "util/util_algorithm.h"
29 #include "util/util_logging.h"
30 #include "util/util_md5.h"
31 #include "util/util_path.h"
32 #include "util/util_time.h"
33
34 CCL_NAMESPACE_BEGIN
35
36 class OpenCLSplitKernel;
37
38 static string get_build_options(OpenCLDeviceBase *device, const DeviceRequestedFeatures& requested_features)
39 {
40         string build_options = "-D__SPLIT_KERNEL__ ";
41         build_options += requested_features.get_build_options();
42
43         /* Set compute device build option. */
44         cl_device_type device_type;
45         OpenCLInfo::get_device_type(device->cdDevice, &device_type, &device->ciErr);
46         assert(device->ciErr == CL_SUCCESS);
47         if(device_type == CL_DEVICE_TYPE_GPU) {
48                 build_options += " -D__COMPUTE_DEVICE_GPU__";
49         }
50
51         return build_options;
52 }
53
54 /* OpenCLDeviceSplitKernel's declaration/definition. */
55 class OpenCLDeviceSplitKernel : public OpenCLDeviceBase
56 {
57 public:
58         DeviceSplitKernel *split_kernel;
59         OpenCLProgram program_data_init;
60         OpenCLProgram program_state_buffer_size;
61
62         OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_);
63
64         ~OpenCLDeviceSplitKernel()
65         {
66                 task_pool.stop();
67
68                 /* Release kernels */
69                 program_data_init.release();
70
71                 delete split_kernel;
72         }
73
74         virtual bool show_samples() const {
75                 return true;
76         }
77
78         virtual bool load_kernels(const DeviceRequestedFeatures& requested_features,
79                                   vector<OpenCLDeviceBase::OpenCLProgram*> &programs)
80         {
81                 bool single_program = OpenCLInfo::use_single_program();
82                 program_data_init = OpenCLDeviceBase::OpenCLProgram(this,
83                                                   single_program ? "split" : "split_data_init",
84                                                   single_program ? "kernel_split.cl" : "kernel_data_init.cl",
85                                                   get_build_options(this, requested_features));
86
87                 program_data_init.add_kernel(ustring("path_trace_data_init"));
88                 programs.push_back(&program_data_init);
89
90                 program_state_buffer_size = OpenCLDeviceBase::OpenCLProgram(this,
91                                                   single_program ? "split" : "split_state_buffer_size",
92                                                   single_program ? "kernel_split.cl" : "kernel_state_buffer_size.cl",
93                                                   get_build_options(this, requested_features));
94                 program_state_buffer_size.add_kernel(ustring("path_trace_state_buffer_size"));
95                 programs.push_back(&program_state_buffer_size);
96
97                 return split_kernel->load_kernels(requested_features);
98         }
99
100         void thread_run(DeviceTask *task)
101         {
102                 flush_texture_buffers();
103
104                 if(task->type == DeviceTask::FILM_CONVERT) {
105                         film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
106                 }
107                 else if(task->type == DeviceTask::SHADER) {
108                         shader(*task);
109                 }
110                 else if(task->type == DeviceTask::RENDER) {
111                         RenderTile tile;
112
113                         /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
114                          * fetch its size.
115                          */
116                         typedef struct KernelGlobals {
117                                 ccl_constant KernelData *data;
118                                 ccl_global char *buffers[8];
119
120 #define KERNEL_TEX(type, name) \
121                                 TextureInfo name;
122 #include "kernel/kernel_textures.h"
123 #undef KERNEL_TEX
124
125                                 SplitData split_data;
126                                 SplitParams split_param_data;
127                         } KernelGlobals;
128
129                         /* Allocate buffer for kernel globals */
130                         device_only_memory<KernelGlobals> kgbuffer(this, "kernel_globals");
131                         kgbuffer.resize(1);
132                         mem_alloc(kgbuffer);
133
134                         /* Keep rendering tiles until done. */
135                         while(task->acquire_tile(this, tile)) {
136                                 if(tile.task == RenderTile::PATH_TRACE) {
137                                         assert(tile.task == RenderTile::PATH_TRACE);
138                                         split_kernel->path_trace(task,
139                                                                  tile,
140                                                                  kgbuffer,
141                                                                  *const_mem_map["__data"]);
142
143                                         /* Complete kernel execution before release tile. */
144                                         /* This helps in multi-device render;
145                                          * The device that reaches the critical-section function
146                                          * release_tile waits (stalling other devices from entering
147                                          * release_tile) for all kernels to complete. If device1 (a
148                                          * slow-render device) reaches release_tile first then it would
149                                          * stall device2 (a fast-render device) from proceeding to render
150                                          * next tile.
151                                          */
152                                         clFinish(cqCommandQueue);
153                                 }
154                                 else if(tile.task == RenderTile::DENOISE) {
155                                         tile.sample = tile.start_sample + tile.num_samples;
156                                         denoise(tile, *task);
157                                         task->update_progress(&tile, tile.w*tile.h);
158                                 }
159
160                                 task->release_tile(tile);
161                         }
162
163                         mem_free(kgbuffer);
164                 }
165         }
166
167         bool is_split_kernel()
168         {
169                 return true;
170         }
171
172 protected:
173         /* ** Those guys are for workign around some compiler-specific bugs ** */
174
175         string build_options_for_base_program(
176                 const DeviceRequestedFeatures& requested_features)
177         {
178                 return requested_features.get_build_options();
179         }
180
181         friend class OpenCLSplitKernel;
182         friend class OpenCLSplitKernelFunction;
183 };
184
185 struct CachedSplitMemory {
186         int id;
187         device_memory *split_data;
188         device_memory *ray_state;
189         device_memory *queue_index;
190         device_memory *use_queues_flag;
191         device_memory *work_pools;
192         device_ptr *buffer;
193 };
194
195 class OpenCLSplitKernelFunction : public SplitKernelFunction {
196 public:
197         OpenCLDeviceSplitKernel* device;
198         OpenCLDeviceBase::OpenCLProgram program;
199         CachedSplitMemory& cached_memory;
200         int cached_id;
201
202         OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device, CachedSplitMemory& cached_memory) :
203                         device(device), cached_memory(cached_memory), cached_id(cached_memory.id-1)
204         {
205         }
206
207         ~OpenCLSplitKernelFunction()
208         {
209                 program.release();
210         }
211
212         virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data)
213         {
214                 if(cached_id != cached_memory.id) {
215                         cl_uint start_arg_index =
216                                 device->kernel_set_args(program(),
217                                                     0,
218                                                     kg,
219                                                     data,
220                                                     *cached_memory.split_data,
221                                                     *cached_memory.ray_state);
222
223                                 device->set_kernel_arg_buffers(program(), &start_arg_index);
224
225                         start_arg_index +=
226                                 device->kernel_set_args(program(),
227                                                     start_arg_index,
228                                                     *cached_memory.queue_index,
229                                                     *cached_memory.use_queues_flag,
230                                                     *cached_memory.work_pools,
231                                                     *cached_memory.buffer);
232
233                         cached_id = cached_memory.id;
234                 }
235
236                 device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
237                                                        program(),
238                                                        2,
239                                                        NULL,
240                                                        dim.global_size,
241                                                        dim.local_size,
242                                                        0,
243                                                        NULL,
244                                                        NULL);
245
246                 device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
247
248                 if(device->ciErr != CL_SUCCESS) {
249                         string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
250                                                        clewErrorString(device->ciErr));
251                         device->opencl_error(message);
252                         return false;
253                 }
254
255                 return true;
256         }
257 };
258
259 class OpenCLSplitKernel : public DeviceSplitKernel {
260         OpenCLDeviceSplitKernel *device;
261         CachedSplitMemory cached_memory;
262 public:
263         explicit OpenCLSplitKernel(OpenCLDeviceSplitKernel *device) : DeviceSplitKernel(device), device(device) {
264         }
265
266         virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name,
267                                                                const DeviceRequestedFeatures& requested_features)
268         {
269                 OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device, cached_memory);
270
271                 bool single_program = OpenCLInfo::use_single_program();
272                 kernel->program =
273                         OpenCLDeviceBase::OpenCLProgram(device,
274                                                         single_program ? "split" : "split_" + kernel_name,
275                                                         single_program ? "kernel_split.cl" : "kernel_" + kernel_name + ".cl",
276                                                         get_build_options(device, requested_features));
277
278                 kernel->program.add_kernel(ustring("path_trace_" + kernel_name));
279                 kernel->program.load();
280
281                 if(!kernel->program.is_loaded()) {
282                         delete kernel;
283                         return NULL;
284                 }
285
286                 return kernel;
287         }
288
289         virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads)
290         {
291                 device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
292                 size_buffer.resize(1);
293                 device->mem_alloc(size_buffer);
294
295                 uint threads = num_threads;
296                 device->kernel_set_args(device->program_state_buffer_size(), 0, kg, data, threads, size_buffer);
297
298                 size_t global_size = 64;
299                 device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
300                                                device->program_state_buffer_size(),
301                                                1,
302                                                NULL,
303                                                &global_size,
304                                                NULL,
305                                                0,
306                                                NULL,
307                                                NULL);
308
309                 device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
310
311                 device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint64_t));
312                 size_t size = size_buffer[0];
313                 device->mem_free(size_buffer);
314
315                 if(device->ciErr != CL_SUCCESS) {
316                         string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
317                                                        clewErrorString(device->ciErr));
318                         device->opencl_error(message);
319                         return 0;
320                 }
321
322                 return size;
323         }
324
325         virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
326                                                     RenderTile& rtile,
327                                                     int num_global_elements,
328                                                     device_memory& kernel_globals,
329                                                     device_memory& kernel_data,
330                                                     device_memory& split_data,
331                                                     device_memory& ray_state,
332                                                     device_memory& queue_index,
333                                                     device_memory& use_queues_flag,
334                                                     device_memory& work_pool_wgs
335                                                     )
336         {
337                 cl_int dQueue_size = dim.global_size[0] * dim.global_size[1];
338
339                 /* Set the range of samples to be processed for every ray in
340                  * path-regeneration logic.
341                  */
342                 cl_int start_sample = rtile.start_sample;
343                 cl_int end_sample = rtile.start_sample + rtile.num_samples;
344
345                 cl_uint start_arg_index =
346                         device->kernel_set_args(device->program_data_init(),
347                                         0,
348                                         kernel_globals,
349                                         kernel_data,
350                                         split_data,
351                                         num_global_elements,
352                                         ray_state);
353
354                         device->set_kernel_arg_buffers(device->program_data_init(), &start_arg_index);
355
356                 start_arg_index +=
357                         device->kernel_set_args(device->program_data_init(),
358                                         start_arg_index,
359                                         start_sample,
360                                         end_sample,
361                                         rtile.x,
362                                         rtile.y,
363                                         rtile.w,
364                                         rtile.h,
365                                         rtile.offset,
366                                         rtile.stride,
367                                         queue_index,
368                                         dQueue_size,
369                                         use_queues_flag,
370                                         work_pool_wgs,
371                                         rtile.num_samples,
372                                         rtile.buffer);
373
374                 /* Enqueue ckPathTraceKernel_data_init kernel. */
375                 device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
376                                                device->program_data_init(),
377                                                2,
378                                                NULL,
379                                                dim.global_size,
380                                                dim.local_size,
381                                                0,
382                                                NULL,
383                                                NULL);
384
385                 device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
386
387                 if(device->ciErr != CL_SUCCESS) {
388                         string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
389                                                        clewErrorString(device->ciErr));
390                         device->opencl_error(message);
391                         return false;
392                 }
393
394                 cached_memory.split_data = &split_data;
395                 cached_memory.ray_state = &ray_state;
396                 cached_memory.queue_index = &queue_index;
397                 cached_memory.use_queues_flag = &use_queues_flag;
398                 cached_memory.work_pools = &work_pool_wgs;
399                 cached_memory.buffer = &rtile.buffer;
400                 cached_memory.id++;
401
402                 return true;
403         }
404
405         virtual int2 split_kernel_local_size()
406         {
407                 return make_int2(64, 1);
408         }
409
410         virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/)
411         {
412                 cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice);
413                 /* Use small global size on CPU devices as it seems to be much faster. */
414                 if(type == CL_DEVICE_TYPE_CPU) {
415                         VLOG(1) << "Global size: (64, 64).";
416                         return make_int2(64, 64);
417                 }
418
419                 cl_ulong max_buffer_size;
420                 clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
421
422                 if(DebugFlags().opencl.mem_limit) {
423                         max_buffer_size = min(max_buffer_size,
424                                               cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used));
425                 }
426
427                 VLOG(1) << "Maximum device allocation size: "
428                         << string_human_readable_number(max_buffer_size) << " bytes. ("
429                         << string_human_readable_size(max_buffer_size) << ").";
430
431                 size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size / 2);
432                 int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64), (int)sqrt(num_elements));
433                 VLOG(1) << "Global size: " << global_size << ".";
434                 return global_size;
435         }
436 };
437
438 OpenCLDeviceSplitKernel::OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_)
439 : OpenCLDeviceBase(info, stats, background_)
440 {
441         split_kernel = new OpenCLSplitKernel(this);
442
443         background = background_;
444 }
445
446 Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, bool background)
447 {
448         return new OpenCLDeviceSplitKernel(info, stats, background);
449 }
450
451 CCL_NAMESPACE_END
452
453 #endif /* WITH_OPENCL */