a44f5da3a32c5d18b153b8a87aa2d326ba1babad
[blender-staging.git] / intern / cycles / device / opencl / opencl_split.cpp
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifdef WITH_OPENCL
18
19 #include "opencl.h"
20
21 #include "buffers.h"
22
23 #include "kernel_types.h"
24 #include "kernel_split_data.h"
25
26 #include "device_split_kernel.h"
27
28 #include "util_logging.h"
29 #include "util_md5.h"
30 #include "util_path.h"
31 #include "util_time.h"
32
33 CCL_NAMESPACE_BEGIN
34
35 class OpenCLSplitKernel;
36
37 static string get_build_options(OpenCLDeviceBase *device, const DeviceRequestedFeatures& requested_features)
38 {
39         string build_options = "-D__SPLIT_KERNEL__ ";
40         build_options += requested_features.get_build_options();
41
42         /* Set compute device build option. */
43         cl_device_type device_type;
44         device->ciErr = clGetDeviceInfo(device->cdDevice,
45                                 CL_DEVICE_TYPE,
46                                 sizeof(cl_device_type),
47                                 &device_type,
48                                 NULL);
49         assert(device->ciErr == CL_SUCCESS);
50         if(device_type == CL_DEVICE_TYPE_GPU) {
51                 build_options += " -D__COMPUTE_DEVICE_GPU__";
52         }
53
54         return build_options;
55 }
56
57 /* OpenCLDeviceSplitKernel's declaration/definition. */
58 class OpenCLDeviceSplitKernel : public OpenCLDeviceBase
59 {
60 public:
61         DeviceSplitKernel *split_kernel;
62         OpenCLProgram program_data_init;
63         OpenCLProgram program_state_buffer_size;
64
65         OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_);
66
67         ~OpenCLDeviceSplitKernel()
68         {
69                 task_pool.stop();
70
71                 /* Release kernels */
72                 program_data_init.release();
73
74                 delete split_kernel;
75         }
76
77         virtual bool load_kernels(const DeviceRequestedFeatures& requested_features,
78                                   vector<OpenCLDeviceBase::OpenCLProgram*> &programs)
79         {
80                 program_data_init = OpenCLDeviceBase::OpenCLProgram(this,
81                                                   "split_data_init",
82                                                   "kernel_data_init.cl",
83                                                   get_build_options(this, requested_features));
84                 program_data_init.add_kernel(ustring("path_trace_data_init"));
85                 programs.push_back(&program_data_init);
86
87                 program_state_buffer_size = OpenCLDeviceBase::OpenCLProgram(this,
88                                                   "split_state_buffer_size",
89                                                   "kernel_state_buffer_size.cl",
90                                                   get_build_options(this, requested_features));
91                 program_state_buffer_size.add_kernel(ustring("path_trace_state_buffer_size"));
92                 programs.push_back(&program_state_buffer_size);
93
94                 return split_kernel->load_kernels(requested_features);
95         }
96
97         void thread_run(DeviceTask *task)
98         {
99                 if(task->type == DeviceTask::FILM_CONVERT) {
100                         film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
101                 }
102                 else if(task->type == DeviceTask::SHADER) {
103                         shader(*task);
104                 }
105                 else if(task->type == DeviceTask::PATH_TRACE) {
106                         RenderTile tile;
107
108                         /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
109                          * fetch its size.
110                          */
111                         typedef struct KernelGlobals {
112                                 ccl_constant KernelData *data;
113 #define KERNEL_TEX(type, ttype, name) \
114                                 ccl_global type *name;
115 #include "kernel_textures.h"
116 #undef KERNEL_TEX
117                                 void *sd_input;
118                                 void *isect_shadow;
119                                 SplitData split_data;
120                                 SplitParams split_param_data;
121                         } KernelGlobals;
122
123                         /* Allocate buffer for kernel globals */
124                         device_memory kgbuffer;
125                         kgbuffer.resize(sizeof(KernelGlobals));
126                         mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE);
127
128                         /* Keep rendering tiles until done. */
129                         while(task->acquire_tile(this, tile)) {
130                                 split_kernel->path_trace(task,
131                                                  tile,
132                                                  kgbuffer,
133                                                  *const_mem_map["__data"]);
134
135                                 /* Complete kernel execution before release tile. */
136                                 /* This helps in multi-device render;
137                                  * The device that reaches the critical-section function
138                                  * release_tile waits (stalling other devices from entering
139                                  * release_tile) for all kernels to complete. If device1 (a
140                                  * slow-render device) reaches release_tile first then it would
141                                  * stall device2 (a fast-render device) from proceeding to render
142                                  * next tile.
143                                  */
144                                 clFinish(cqCommandQueue);
145
146                                 task->release_tile(tile);
147                         }
148
149                         mem_free(kgbuffer);
150                 }
151         }
152
153 protected:
154         /* ** Those guys are for workign around some compiler-specific bugs ** */
155
156         string build_options_for_base_program(
157                 const DeviceRequestedFeatures& requested_features)
158         {
159                 return requested_features.get_build_options();
160         }
161
162         friend class OpenCLSplitKernel;
163         friend class OpenCLSplitKernelFunction;
164 };
165
166 class OpenCLSplitKernelFunction : public SplitKernelFunction {
167 public:
168         OpenCLDeviceSplitKernel* device;
169         OpenCLDeviceBase::OpenCLProgram program;
170
171         OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device) : device(device) {}
172         ~OpenCLSplitKernelFunction() { program.release(); }
173
174         virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data)
175         {
176                 device->kernel_set_args(program(), 0, kg, data);
177
178                 device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
179                                                        program(),
180                                                        2,
181                                                        NULL,
182                                                        dim.global_size,
183                                                        dim.local_size,
184                                                        0,
185                                                        NULL,
186                                                        NULL);
187
188                 device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
189
190                 if(device->ciErr != CL_SUCCESS) {
191                         string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
192                                                        clewErrorString(device->ciErr));
193                         device->opencl_error(message);
194                         return false;
195                 }
196
197                 return true;
198         }
199 };
200
201 class OpenCLSplitKernel : public DeviceSplitKernel {
202         OpenCLDeviceSplitKernel *device;
203 public:
204         explicit OpenCLSplitKernel(OpenCLDeviceSplitKernel *device) : DeviceSplitKernel(device), device(device) {
205         }
206
207         virtual SplitKernelFunction* get_split_kernel_function(string kernel_name,
208                                                                const DeviceRequestedFeatures& requested_features)
209         {
210                 OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device);
211
212                 kernel->program = OpenCLDeviceBase::OpenCLProgram(device,
213                                                 "split_" + kernel_name,
214                                                 "kernel_" + kernel_name + ".cl",
215                                                 get_build_options(device, requested_features));
216                 kernel->program.add_kernel(ustring("path_trace_" + kernel_name));
217                 kernel->program.load();
218
219                 if(!kernel->program.is_loaded()) {
220                         delete kernel;
221                         return NULL;
222                 }
223
224                 return kernel;
225         }
226
227         virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads)
228         {
229                 device_vector<uint> size_buffer;
230                 size_buffer.resize(1);
231                 device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE);
232
233                 uint threads = num_threads;
234                 device->kernel_set_args(device->program_state_buffer_size(), 0, kg, data, threads, size_buffer);
235
236                 size_t global_size = 64;
237                 device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
238                                                device->program_state_buffer_size(),
239                                                1,
240                                                NULL,
241                                                &global_size,
242                                                NULL,
243                                                0,
244                                                NULL,
245                                                NULL);
246
247                 device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
248
249                 device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint));
250                 device->mem_free(size_buffer);
251
252                 if(device->ciErr != CL_SUCCESS) {
253                         string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
254                                                        clewErrorString(device->ciErr));
255                         device->opencl_error(message);
256                         return 0;
257                 }
258
259                 return *size_buffer.get_data();
260         }
261
262         virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
263                                                     RenderTile& rtile,
264                                                     int num_global_elements,
265                                                     device_memory& kernel_globals,
266                                                     device_memory& kernel_data,
267                                                     device_memory& split_data,
268                                                     device_memory& ray_state,
269                                                     device_memory& queue_index,
270                                                     device_memory& use_queues_flag,
271                                                     device_memory& work_pool_wgs
272                                                     )
273         {
274                 cl_int dQueue_size = dim.global_size[0] * dim.global_size[1];
275
276                 /* Set the range of samples to be processed for every ray in
277                  * path-regeneration logic.
278                  */
279                 cl_int start_sample = rtile.start_sample;
280                 cl_int end_sample = rtile.start_sample + rtile.num_samples;
281
282                 cl_uint start_arg_index =
283                         device->kernel_set_args(device->program_data_init(),
284                                         0,
285                                         kernel_globals,
286                                         kernel_data,
287                                                         split_data,
288                                         num_global_elements,
289                                                         ray_state,
290                                         rtile.rng_state);
291
292 /* TODO(sergey): Avoid map lookup here. */
293 #define KERNEL_TEX(type, ttype, name) \
294         device->set_kernel_arg_mem(device->program_data_init(), &start_arg_index, #name);
295 #include "kernel_textures.h"
296 #undef KERNEL_TEX
297
298                 start_arg_index +=
299                         device->kernel_set_args(device->program_data_init(),
300                                         start_arg_index,
301                                         start_sample,
302                                         end_sample,
303                                         rtile.x,
304                                         rtile.y,
305                                         rtile.w,
306                                         rtile.h,
307                                         rtile.offset,
308                                         rtile.stride,
309                                         queue_index,
310                                         dQueue_size,
311                                         use_queues_flag,
312                                         work_pool_wgs,
313                                         rtile.num_samples,
314                                                         rtile.buffer);
315
316                 /* Enqueue ckPathTraceKernel_data_init kernel. */
317                 device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
318                                                device->program_data_init(),
319                                                2,
320                                                NULL,
321                                                dim.global_size,
322                                                dim.local_size,
323                                                0,
324                                                NULL,
325                                                NULL);
326
327                 device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
328
329                 if(device->ciErr != CL_SUCCESS) {
330                         string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
331                                                        clewErrorString(device->ciErr));
332                         device->opencl_error(message);
333                         return false;
334                 }
335
336                 return true;
337         }
338
339         virtual int2 split_kernel_local_size()
340         {
341                 return make_int2(64, 1);
342         }
343
344         virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask */*task*/)
345         {
346                 size_t max_buffer_size;
347                 clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_buffer_size, NULL);
348                 VLOG(1) << "Maximum device allocation side: "
349                         << string_human_readable_number(max_buffer_size) << " bytes. ("
350                         << string_human_readable_size(max_buffer_size) << ").";
351
352                 size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size / 2);
353                 int2 global_size = make_int2(round_down((int)sqrt(num_elements), 64), (int)sqrt(num_elements));
354                 VLOG(1) << "Global size: " << global_size << ".";
355                 return global_size;
356         }
357 };
358
359 OpenCLDeviceSplitKernel::OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_)
360 : OpenCLDeviceBase(info, stats, background_)
361 {
362         split_kernel = new OpenCLSplitKernel(this);
363
364         background = background_;
365 }
366
367 Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, bool background)
368 {
369         return new OpenCLDeviceSplitKernel(info, stats, background);
370 }
371
372 CCL_NAMESPACE_END
373
374 #endif /* WITH_OPENCL */