Cycles: Make all #include statements relative to cycles source directory
[blender.git] / intern / cycles / device / opencl / opencl_split.cpp
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifdef WITH_OPENCL
18
19 #include "device/opencl/opencl.h"
20
21 #include "render/buffers.h"
22
23 #include "kernel/kernel_types.h"
24 #include "kernel/split/kernel_split_data_types.h"
25
26 #include "device/device_split_kernel.h"
27
28 #include "util/util_logging.h"
29 #include "util/util_md5.h"
30 #include "util/util_path.h"
31 #include "util/util_time.h"
32
33 CCL_NAMESPACE_BEGIN
34
35 class OpenCLSplitKernel;
36
37 static string get_build_options(OpenCLDeviceBase *device, const DeviceRequestedFeatures& requested_features)
38 {
39         string build_options = "-D__SPLIT_KERNEL__ ";
40         build_options += requested_features.get_build_options();
41
42         /* Set compute device build option. */
43         cl_device_type device_type;
44         OpenCLInfo::get_device_type(device->cdDevice, &device_type, &device->ciErr);
45         assert(device->ciErr == CL_SUCCESS);
46         if(device_type == CL_DEVICE_TYPE_GPU) {
47                 build_options += " -D__COMPUTE_DEVICE_GPU__";
48         }
49
50         return build_options;
51 }
52
53 /* OpenCLDeviceSplitKernel's declaration/definition. */
54 class OpenCLDeviceSplitKernel : public OpenCLDeviceBase
55 {
56 public:
57         DeviceSplitKernel *split_kernel;
58         OpenCLProgram program_data_init;
59         OpenCLProgram program_state_buffer_size;
60
61         OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_);
62
63         ~OpenCLDeviceSplitKernel()
64         {
65                 task_pool.stop();
66
67                 /* Release kernels */
68                 program_data_init.release();
69
70                 delete split_kernel;
71         }
72
73         virtual bool load_kernels(const DeviceRequestedFeatures& requested_features,
74                                   vector<OpenCLDeviceBase::OpenCLProgram*> &programs)
75         {
76                 bool single_program = OpenCLInfo::use_single_program();
77                 program_data_init = OpenCLDeviceBase::OpenCLProgram(this,
78                                                   single_program ? "split" : "split_data_init",
79                                                   single_program ? "kernel_split.cl" : "kernel_data_init.cl",
80                                                   get_build_options(this, requested_features));
81
82                 program_data_init.add_kernel(ustring("path_trace_data_init"));
83                 programs.push_back(&program_data_init);
84
85                 program_state_buffer_size = OpenCLDeviceBase::OpenCLProgram(this,
86                                                   single_program ? "split" : "split_state_buffer_size",
87                                                   single_program ? "kernel_split.cl" : "kernel_state_buffer_size.cl",
88                                                   get_build_options(this, requested_features));
89                 program_state_buffer_size.add_kernel(ustring("path_trace_state_buffer_size"));
90                 programs.push_back(&program_state_buffer_size);
91
92                 return split_kernel->load_kernels(requested_features);
93         }
94
95         void thread_run(DeviceTask *task)
96         {
97                 if(task->type == DeviceTask::FILM_CONVERT) {
98                         film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
99                 }
100                 else if(task->type == DeviceTask::SHADER) {
101                         shader(*task);
102                 }
103                 else if(task->type == DeviceTask::PATH_TRACE) {
104                         RenderTile tile;
105
106                         /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
107                          * fetch its size.
108                          */
109                         typedef struct KernelGlobals {
110                                 ccl_constant KernelData *data;
111 #define KERNEL_TEX(type, ttype, name) \
112                                 ccl_global type *name;
113 #include "kernel/kernel_textures.h"
114 #undef KERNEL_TEX
115                                 SplitData split_data;
116                                 SplitParams split_param_data;
117                         } KernelGlobals;
118
119                         /* Allocate buffer for kernel globals */
120                         device_memory kgbuffer;
121                         kgbuffer.resize(sizeof(KernelGlobals));
122                         mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE);
123
124                         /* Keep rendering tiles until done. */
125                         while(task->acquire_tile(this, tile)) {
126                                 split_kernel->path_trace(task,
127                                                          tile,
128                                                          kgbuffer,
129                                                          *const_mem_map["__data"]);
130
131                                 /* Complete kernel execution before release tile. */
132                                 /* This helps in multi-device render;
133                                  * The device that reaches the critical-section function
134                                  * release_tile waits (stalling other devices from entering
135                                  * release_tile) for all kernels to complete. If device1 (a
136                                  * slow-render device) reaches release_tile first then it would
137                                  * stall device2 (a fast-render device) from proceeding to render
138                                  * next tile.
139                                  */
140                                 clFinish(cqCommandQueue);
141
142                                 task->release_tile(tile);
143                         }
144
145                         mem_free(kgbuffer);
146                 }
147         }
148
149 protected:
150         /* ** Those guys are for workign around some compiler-specific bugs ** */
151
152         string build_options_for_base_program(
153                 const DeviceRequestedFeatures& requested_features)
154         {
155                 return requested_features.get_build_options();
156         }
157
158         friend class OpenCLSplitKernel;
159         friend class OpenCLSplitKernelFunction;
160 };
161
162 class OpenCLSplitKernelFunction : public SplitKernelFunction {
163 public:
164         OpenCLDeviceSplitKernel* device;
165         OpenCLDeviceBase::OpenCLProgram program;
166
167         OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device) : device(device) {}
168         ~OpenCLSplitKernelFunction() { program.release(); }
169
170         virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data)
171         {
172                 device->kernel_set_args(program(), 0, kg, data);
173
174                 device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
175                                                        program(),
176                                                        2,
177                                                        NULL,
178                                                        dim.global_size,
179                                                        dim.local_size,
180                                                        0,
181                                                        NULL,
182                                                        NULL);
183
184                 device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
185
186                 if(device->ciErr != CL_SUCCESS) {
187                         string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
188                                                        clewErrorString(device->ciErr));
189                         device->opencl_error(message);
190                         return false;
191                 }
192
193                 return true;
194         }
195 };
196
197 class OpenCLSplitKernel : public DeviceSplitKernel {
198         OpenCLDeviceSplitKernel *device;
199 public:
200         explicit OpenCLSplitKernel(OpenCLDeviceSplitKernel *device) : DeviceSplitKernel(device), device(device) {
201         }
202
203         virtual SplitKernelFunction* get_split_kernel_function(string kernel_name,
204                                                                const DeviceRequestedFeatures& requested_features)
205         {
206                 OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device);
207
208                 bool single_program = OpenCLInfo::use_single_program();
209                 kernel->program =
210                         OpenCLDeviceBase::OpenCLProgram(device,
211                                                         single_program ? "split" : "split_" + kernel_name,
212                                                         single_program ? "kernel_split.cl" : "kernel_" + kernel_name + ".cl",
213                                                         get_build_options(device, requested_features));
214
215                 kernel->program.add_kernel(ustring("path_trace_" + kernel_name));
216                 kernel->program.load();
217
218                 if(!kernel->program.is_loaded()) {
219                         delete kernel;
220                         return NULL;
221                 }
222
223                 return kernel;
224         }
225
226         virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads)
227         {
228                 device_vector<uint64_t> size_buffer;
229                 size_buffer.resize(1);
230                 device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE);
231
232                 uint threads = num_threads;
233                 device->kernel_set_args(device->program_state_buffer_size(), 0, kg, data, threads, size_buffer);
234
235                 size_t global_size = 64;
236                 device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
237                                                device->program_state_buffer_size(),
238                                                1,
239                                                NULL,
240                                                &global_size,
241                                                NULL,
242                                                0,
243                                                NULL,
244                                                NULL);
245
246                 device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
247
248                 device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint64_t));
249                 device->mem_free(size_buffer);
250
251                 if(device->ciErr != CL_SUCCESS) {
252                         string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
253                                                        clewErrorString(device->ciErr));
254                         device->opencl_error(message);
255                         return 0;
256                 }
257
258                 return *size_buffer.get_data();
259         }
260
261         virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
262                                                     RenderTile& rtile,
263                                                     int num_global_elements,
264                                                     device_memory& kernel_globals,
265                                                     device_memory& kernel_data,
266                                                     device_memory& split_data,
267                                                     device_memory& ray_state,
268                                                     device_memory& queue_index,
269                                                     device_memory& use_queues_flag,
270                                                     device_memory& work_pool_wgs
271                                                     )
272         {
273                 cl_int dQueue_size = dim.global_size[0] * dim.global_size[1];
274
275                 /* Set the range of samples to be processed for every ray in
276                  * path-regeneration logic.
277                  */
278                 cl_int start_sample = rtile.start_sample;
279                 cl_int end_sample = rtile.start_sample + rtile.num_samples;
280
281                 cl_uint start_arg_index =
282                         device->kernel_set_args(device->program_data_init(),
283                                         0,
284                                         kernel_globals,
285                                         kernel_data,
286                                         split_data,
287                                         num_global_elements,
288                                         ray_state,
289                                         rtile.rng_state);
290
291 /* TODO(sergey): Avoid map lookup here. */
292 #define KERNEL_TEX(type, ttype, name) \
293         device->set_kernel_arg_mem(device->program_data_init(), &start_arg_index, #name);
294 #include "kernel/kernel_textures.h"
295 #undef KERNEL_TEX
296
297                 start_arg_index +=
298                         device->kernel_set_args(device->program_data_init(),
299                                         start_arg_index,
300                                         start_sample,
301                                         end_sample,
302                                         rtile.x,
303                                         rtile.y,
304                                         rtile.w,
305                                         rtile.h,
306                                         rtile.offset,
307                                         rtile.stride,
308                                         queue_index,
309                                         dQueue_size,
310                                         use_queues_flag,
311                                         work_pool_wgs,
312                                         rtile.num_samples,
313                                         rtile.buffer);
314
315                 /* Enqueue ckPathTraceKernel_data_init kernel. */
316                 device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
317                                                device->program_data_init(),
318                                                2,
319                                                NULL,
320                                                dim.global_size,
321                                                dim.local_size,
322                                                0,
323                                                NULL,
324                                                NULL);
325
326                 device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
327
328                 if(device->ciErr != CL_SUCCESS) {
329                         string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
330                                                        clewErrorString(device->ciErr));
331                         device->opencl_error(message);
332                         return false;
333                 }
334
335                 return true;
336         }
337
338         virtual int2 split_kernel_local_size()
339         {
340                 return make_int2(64, 1);
341         }
342
343         virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask */*task*/)
344         {
345                 cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice);
346                 /* Use small global size on CPU devices as it seems to be much faster. */
347                 if(type == CL_DEVICE_TYPE_CPU) {
348                         VLOG(1) << "Global size: (64, 64).";
349                         return make_int2(64, 64);
350                 }
351
352                 cl_ulong max_buffer_size;
353                 clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
354                 VLOG(1) << "Maximum device allocation size: "
355                         << string_human_readable_number(max_buffer_size) << " bytes. ("
356                         << string_human_readable_size(max_buffer_size) << ").";
357
358                 size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size / 2);
359                 int2 global_size = make_int2(round_down((int)sqrt(num_elements), 64), (int)sqrt(num_elements));
360                 VLOG(1) << "Global size: " << global_size << ".";
361                 return global_size;
362         }
363 };
364
365 OpenCLDeviceSplitKernel::OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_)
366 : OpenCLDeviceBase(info, stats, background_)
367 {
368         split_kernel = new OpenCLSplitKernel(this);
369
370         background = background_;
371 }
372
373 Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, bool background)
374 {
375         return new OpenCLDeviceSplitKernel(info, stats, background);
376 }
377
378 CCL_NAMESPACE_END
379
380 #endif /* WITH_OPENCL */