Cycles: Cleanup, remove residue of previous split kernel data
[blender-staging.git] / intern / cycles / device / opencl / opencl_split.cpp
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifdef WITH_OPENCL
18
19 #include "opencl.h"
20
21 #include "buffers.h"
22
23 #include "kernel_types.h"
24 #include "kernel_split_data.h"
25
26 #include "device_split_kernel.h"
27
28 #include "util_logging.h"
29 #include "util_md5.h"
30 #include "util_path.h"
31 #include "util_time.h"
32
33 CCL_NAMESPACE_BEGIN
34
35 class OpenCLSplitKernel;
36
37 static string get_build_options(OpenCLDeviceBase *device, const DeviceRequestedFeatures& requested_features)
38 {
39         string build_options = "-D__SPLIT_KERNEL__ ";
40         build_options += requested_features.get_build_options();
41
42         /* Set compute device build option. */
43         cl_device_type device_type;
44         device->ciErr = clGetDeviceInfo(device->cdDevice,
45                                 CL_DEVICE_TYPE,
46                                 sizeof(cl_device_type),
47                                 &device_type,
48                                 NULL);
49         assert(device->ciErr == CL_SUCCESS);
50         if(device_type == CL_DEVICE_TYPE_GPU) {
51                 build_options += " -D__COMPUTE_DEVICE_GPU__";
52         }
53
54         return build_options;
55 }
56
57 /* OpenCLDeviceSplitKernel's declaration/definition. */
58 class OpenCLDeviceSplitKernel : public OpenCLDeviceBase
59 {
60 public:
61         DeviceSplitKernel *split_kernel;
62         OpenCLProgram program_data_init;
63         OpenCLProgram program_state_buffer_size;
64
65         OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_);
66
67         ~OpenCLDeviceSplitKernel()
68         {
69                 task_pool.stop();
70
71                 /* Release kernels */
72                 program_data_init.release();
73
74                 delete split_kernel;
75         }
76
77         virtual bool load_kernels(const DeviceRequestedFeatures& requested_features,
78                                   vector<OpenCLDeviceBase::OpenCLProgram*> &programs)
79         {
80                 program_data_init = OpenCLDeviceBase::OpenCLProgram(this,
81                                                   "split_data_init",
82                                                   "kernel_data_init.cl",
83                                                   get_build_options(this, requested_features));
84                 program_data_init.add_kernel(ustring("path_trace_data_init"));
85                 programs.push_back(&program_data_init);
86
87                 program_state_buffer_size = OpenCLDeviceBase::OpenCLProgram(this,
88                                                   "split_state_buffer_size",
89                                                   "kernel_state_buffer_size.cl",
90                                                   get_build_options(this, requested_features));
91                 program_state_buffer_size.add_kernel(ustring("path_trace_state_buffer_size"));
92                 programs.push_back(&program_state_buffer_size);
93
94                 return split_kernel->load_kernels(requested_features);
95         }
96
97         void thread_run(DeviceTask *task)
98         {
99                 if(task->type == DeviceTask::FILM_CONVERT) {
100                         film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
101                 }
102                 else if(task->type == DeviceTask::SHADER) {
103                         shader(*task);
104                 }
105                 else if(task->type == DeviceTask::PATH_TRACE) {
106                         RenderTile tile;
107
108                         /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
109                          * fetch its size.
110                          */
111                         typedef struct KernelGlobals {
112                                 ccl_constant KernelData *data;
113 #define KERNEL_TEX(type, ttype, name) \
114                                 ccl_global type *name;
115 #include "kernel_textures.h"
116 #undef KERNEL_TEX
117                                 SplitData split_data;
118                                 SplitParams split_param_data;
119                         } KernelGlobals;
120
121                         /* Allocate buffer for kernel globals */
122                         device_memory kgbuffer;
123                         kgbuffer.resize(sizeof(KernelGlobals));
124                         mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE);
125
126                         /* Keep rendering tiles until done. */
127                         while(task->acquire_tile(this, tile)) {
128                                 split_kernel->path_trace(task,
129                                                          tile,
130                                                          kgbuffer,
131                                                          *const_mem_map["__data"]);
132
133                                 /* Complete kernel execution before release tile. */
134                                 /* This helps in multi-device render;
135                                  * The device that reaches the critical-section function
136                                  * release_tile waits (stalling other devices from entering
137                                  * release_tile) for all kernels to complete. If device1 (a
138                                  * slow-render device) reaches release_tile first then it would
139                                  * stall device2 (a fast-render device) from proceeding to render
140                                  * next tile.
141                                  */
142                                 clFinish(cqCommandQueue);
143
144                                 task->release_tile(tile);
145                         }
146
147                         mem_free(kgbuffer);
148                 }
149         }
150
151 protected:
152         /* ** Those guys are for workign around some compiler-specific bugs ** */
153
154         string build_options_for_base_program(
155                 const DeviceRequestedFeatures& requested_features)
156         {
157                 return requested_features.get_build_options();
158         }
159
160         friend class OpenCLSplitKernel;
161         friend class OpenCLSplitKernelFunction;
162 };
163
164 class OpenCLSplitKernelFunction : public SplitKernelFunction {
165 public:
166         OpenCLDeviceSplitKernel* device;
167         OpenCLDeviceBase::OpenCLProgram program;
168
169         OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device) : device(device) {}
170         ~OpenCLSplitKernelFunction() { program.release(); }
171
172         virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data)
173         {
174                 device->kernel_set_args(program(), 0, kg, data);
175
176                 device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
177                                                        program(),
178                                                        2,
179                                                        NULL,
180                                                        dim.global_size,
181                                                        dim.local_size,
182                                                        0,
183                                                        NULL,
184                                                        NULL);
185
186                 device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
187
188                 if(device->ciErr != CL_SUCCESS) {
189                         string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
190                                                        clewErrorString(device->ciErr));
191                         device->opencl_error(message);
192                         return false;
193                 }
194
195                 return true;
196         }
197 };
198
199 class OpenCLSplitKernel : public DeviceSplitKernel {
200         OpenCLDeviceSplitKernel *device;
201 public:
202         explicit OpenCLSplitKernel(OpenCLDeviceSplitKernel *device) : DeviceSplitKernel(device), device(device) {
203         }
204
205         virtual SplitKernelFunction* get_split_kernel_function(string kernel_name,
206                                                                const DeviceRequestedFeatures& requested_features)
207         {
208                 OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device);
209
210                 kernel->program = OpenCLDeviceBase::OpenCLProgram(device,
211                                                 "split_" + kernel_name,
212                                                 "kernel_" + kernel_name + ".cl",
213                                                 get_build_options(device, requested_features));
214                 kernel->program.add_kernel(ustring("path_trace_" + kernel_name));
215                 kernel->program.load();
216
217                 if(!kernel->program.is_loaded()) {
218                         delete kernel;
219                         return NULL;
220                 }
221
222                 return kernel;
223         }
224
225         virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads)
226         {
227                 device_vector<uint> size_buffer;
228                 size_buffer.resize(1);
229                 device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE);
230
231                 uint threads = num_threads;
232                 device->kernel_set_args(device->program_state_buffer_size(), 0, kg, data, threads, size_buffer);
233
234                 size_t global_size = 64;
235                 device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
236                                                device->program_state_buffer_size(),
237                                                1,
238                                                NULL,
239                                                &global_size,
240                                                NULL,
241                                                0,
242                                                NULL,
243                                                NULL);
244
245                 device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
246
247                 device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint));
248                 device->mem_free(size_buffer);
249
250                 if(device->ciErr != CL_SUCCESS) {
251                         string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
252                                                        clewErrorString(device->ciErr));
253                         device->opencl_error(message);
254                         return 0;
255                 }
256
257                 return *size_buffer.get_data();
258         }
259
260         virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
261                                                     RenderTile& rtile,
262                                                     int num_global_elements,
263                                                     device_memory& kernel_globals,
264                                                     device_memory& kernel_data,
265                                                     device_memory& split_data,
266                                                     device_memory& ray_state,
267                                                     device_memory& queue_index,
268                                                     device_memory& use_queues_flag,
269                                                     device_memory& work_pool_wgs
270                                                     )
271         {
272                 cl_int dQueue_size = dim.global_size[0] * dim.global_size[1];
273
274                 /* Set the range of samples to be processed for every ray in
275                  * path-regeneration logic.
276                  */
277                 cl_int start_sample = rtile.start_sample;
278                 cl_int end_sample = rtile.start_sample + rtile.num_samples;
279
280                 cl_uint start_arg_index =
281                         device->kernel_set_args(device->program_data_init(),
282                                         0,
283                                         kernel_globals,
284                                         kernel_data,
285                                         split_data,
286                                         num_global_elements,
287                                         ray_state,
288                                         rtile.rng_state);
289
290 /* TODO(sergey): Avoid map lookup here. */
291 #define KERNEL_TEX(type, ttype, name) \
292         device->set_kernel_arg_mem(device->program_data_init(), &start_arg_index, #name);
293 #include "kernel_textures.h"
294 #undef KERNEL_TEX
295
296                 start_arg_index +=
297                         device->kernel_set_args(device->program_data_init(),
298                                         start_arg_index,
299                                         start_sample,
300                                         end_sample,
301                                         rtile.x,
302                                         rtile.y,
303                                         rtile.w,
304                                         rtile.h,
305                                         rtile.offset,
306                                         rtile.stride,
307                                         queue_index,
308                                         dQueue_size,
309                                         use_queues_flag,
310                                         work_pool_wgs,
311                                         rtile.num_samples,
312                                         rtile.buffer);
313
314                 /* Enqueue ckPathTraceKernel_data_init kernel. */
315                 device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
316                                                device->program_data_init(),
317                                                2,
318                                                NULL,
319                                                dim.global_size,
320                                                dim.local_size,
321                                                0,
322                                                NULL,
323                                                NULL);
324
325                 device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
326
327                 if(device->ciErr != CL_SUCCESS) {
328                         string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
329                                                        clewErrorString(device->ciErr));
330                         device->opencl_error(message);
331                         return false;
332                 }
333
334                 return true;
335         }
336
337         virtual int2 split_kernel_local_size()
338         {
339                 return make_int2(64, 1);
340         }
341
342         virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask */*task*/)
343         {
344                 size_t max_buffer_size;
345                 clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_buffer_size, NULL);
346                 VLOG(1) << "Maximum device allocation side: "
347                         << string_human_readable_number(max_buffer_size) << " bytes. ("
348                         << string_human_readable_size(max_buffer_size) << ").";
349
350                 size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size / 2);
351                 int2 global_size = make_int2(round_down((int)sqrt(num_elements), 64), (int)sqrt(num_elements));
352                 VLOG(1) << "Global size: " << global_size << ".";
353                 return global_size;
354         }
355 };
356
357 OpenCLDeviceSplitKernel::OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_)
358 : OpenCLDeviceBase(info, stats, background_)
359 {
360         split_kernel = new OpenCLSplitKernel(this);
361
362         background = background_;
363 }
364
365 Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, bool background)
366 {
367         return new OpenCLDeviceSplitKernel(info, stats, background);
368 }
369
370 CCL_NAMESPACE_END
371
372 #endif /* WITH_OPENCL */