Cycles: Split path initialization into own kernel
[blender-staging.git] / intern / cycles / device / device_split_kernel.cpp
1 /*
2  * Copyright 2011-2016 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include "device_split_kernel.h"
18
19 #include "kernel_types.h"
20 #include "kernel_split_data.h"
21
22 #include "util_time.h"
23
24 CCL_NAMESPACE_BEGIN
25
26 static const double alpha = 0.1; /* alpha for rolling average */
27
28 DeviceSplitKernel::DeviceSplitKernel(Device *device) : device(device)
29 {
30         current_max_closure = -1;
31         first_tile = true;
32
33         avg_time_per_sample = 0.0;
34 }
35
DeviceSplitKernel::~DeviceSplitKernel()
{
	/* Release the device-side state buffers allocated lazily on the first
	 * tile in path_trace(). */
	device->mem_free(split_data);
	device->mem_free(ray_state);
	device->mem_free(use_queues_flag);
	device->mem_free(queue_index);
	device->mem_free(work_pool_wgs);

	/* Destroy the per-stage kernel wrappers created by load_kernels().
	 * NOTE(review): these pointers are only assigned inside load_kernels();
	 * confirm they are null-initialized in the constructor so this is safe
	 * when load_kernels() was never called or failed part-way. */
	delete kernel_path_init;
	delete kernel_scene_intersect;
	delete kernel_lamp_emission;
	delete kernel_queue_enqueue;
	delete kernel_background_buffer_update;
	delete kernel_shader_eval;
	delete kernel_holdout_emission_blurring_pathtermination_ao;
	delete kernel_direct_lighting;
	delete kernel_shadow_blocked;
	delete kernel_next_iteration_setup;
	delete kernel_sum_all_radiance;
}
56
57 bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_features)
58 {
59 #define LOAD_KERNEL(name) \
60                 kernel_##name = get_split_kernel_function(#name, requested_features); \
61                 if(!kernel_##name) { \
62                         return false; \
63                 }
64
65         LOAD_KERNEL(path_init);
66         LOAD_KERNEL(scene_intersect);
67         LOAD_KERNEL(lamp_emission);
68         LOAD_KERNEL(queue_enqueue);
69         LOAD_KERNEL(background_buffer_update);
70         LOAD_KERNEL(shader_eval);
71         LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
72         LOAD_KERNEL(direct_lighting);
73         LOAD_KERNEL(shadow_blocked);
74         LOAD_KERNEL(next_iteration_setup);
75         LOAD_KERNEL(sum_all_radiance);
76
77 #undef LOAD_KERNEL
78
79         current_max_closure = requested_features.max_closure;
80
81         return true;
82 }
83
84 size_t DeviceSplitKernel::max_elements_for_max_buffer_size(size_t max_buffer_size, size_t passes_size)
85 {
86         size_t size_per_element = split_data_buffer_size(1024, current_max_closure, passes_size) / 1024;
87         return max_buffer_size / size_per_element;
88 }
89
bool DeviceSplitKernel::path_trace(DeviceTask *task,
                                   RenderTile& tile,
                                   device_memory& kgbuffer,
                                   device_memory& kernel_data)
{
	/* Render one tile with the split kernel: repeatedly enqueue the chain of
	 * per-stage kernels, polling ray_state on the host to decide when all
	 * rays have terminated.  Returns false on device/enqueue error, true on
	 * completion or user cancel. */
	if(device->have_error()) {
		return false;
	}

	/* Get local size */
	size_t local_size[2];
	{
		int2 lsize = split_kernel_local_size();
		local_size[0] = lsize[0];
		local_size[1] = lsize[1];
	}

	/* Calculate per_thread_output_buffer_size. */
	size_t per_thread_output_buffer_size = task->passes_size;

	/* Set global size */
	size_t global_size[2];
	{
		int2 gsize = split_kernel_global_size(task);

		/* Make sure that set work size is a multiple of local
		 * work size dimensions.
		 */
		global_size[0] = round_up(gsize[0], local_size[0]);
		global_size[1] = round_up(gsize[1], local_size[1]);
	}

	/* Number of elements in the global state buffer */
	int num_global_elements = global_size[0] * global_size[1];

	/* Allocate all required global memory once. */
	if(first_tile) {
		first_tile = false;

		/* Calculate max groups */

		/* Denotes the maximum work groups possible w.r.t. current requested tile size. */
		unsigned int max_work_groups = num_global_elements / WORK_POOL_SIZE + 1;

		/* Allocate work_pool_wgs memory. */
		work_pool_wgs.resize(max_work_groups * sizeof(unsigned int));
		device->mem_alloc("work_pool_wgs", work_pool_wgs, MEM_READ_WRITE);

		/* One queue-length counter per queue. */
		queue_index.resize(NUM_QUEUES * sizeof(int));
		device->mem_alloc("queue_index", queue_index, MEM_READ_WRITE);

		/* Single flag telling kernels whether to use the queues. */
		use_queues_flag.resize(sizeof(char));
		device->mem_alloc("use_queues_flag", use_queues_flag, MEM_READ_WRITE);

		/* One state byte per global thread, copied back each iteration to
		 * test for termination. */
		ray_state.resize(num_global_elements);
		device->mem_alloc("ray_state", ray_state, MEM_READ_WRITE);

		/* Main per-thread path state buffer. */
		split_data.resize(split_data_buffer_size(num_global_elements,
		                                         current_max_closure,
		                                         per_thread_output_buffer_size));
		device->mem_alloc("split_data", split_data, MEM_READ_WRITE);
	}

	/* Bail out on a pending device error, then enqueue one split kernel
	 * stage; any enqueue failure aborts the tile. */
#define ENQUEUE_SPLIT_KERNEL(name, global_size, local_size) \
		if(device->have_error()) { \
			return false; \
		} \
		if(!kernel_##name->enqueue(KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \
			return false; \
		}

	tile.sample = tile.start_sample;

	/* for exponential increase between tile updates */
	int time_multiplier = 1;

	while(tile.sample < tile.start_sample + tile.num_samples) {
		/* to keep track of how long it takes to run a number of samples */
		double start_time = time_dt();

		/* initial guess to start rolling average */
		const int initial_num_samples = 1;
		/* approx number of samples per second */
		int samples_per_second = (avg_time_per_sample > 0.0) ?
		                         int(double(time_multiplier) / avg_time_per_sample) + 1 : initial_num_samples;

		/* Batch of samples to run before the next host-side progress update,
		 * clamped so we never overshoot the tile's total sample count. */
		RenderTile subtile = tile;
		subtile.start_sample = tile.sample;
		subtile.num_samples = min(samples_per_second, tile.start_sample + tile.num_samples - tile.sample);

		if(device->have_error()) {
			return false;
		}

		/* reset state memory here as global size for data_init
		 * kernel might not be large enough to do in kernel
		 */
		device->mem_zero(work_pool_wgs);
		device->mem_zero(split_data);

		/* Initialize per-thread state and bind all buffers for this batch. */
		if(!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
		                                   subtile,
		                                   num_global_elements,
		                                   kgbuffer,
		                                   kernel_data,
		                                   split_data,
		                                   ray_state,
		                                   queue_index,
		                                   use_queues_flag,
		                                   work_pool_wgs
		                                   ))
		{
			return false;
		}

		ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size);

		bool activeRaysAvailable = true;

		while(activeRaysAvailable) {
			/* Twice the global work size of other kernels for
			 * ckPathTraceKernel_shadow_blocked_direct_lighting. */
			size_t global_size_shadow_blocked[2];
			global_size_shadow_blocked[0] = global_size[0] * 2;
			global_size_shadow_blocked[1] = global_size[1];

			/* Do path-iteration in host [Enqueue Path-iteration kernels. */
			/* 16 iterations per host round-trip amortizes the cost of the
			 * ray_state readback below. */
			for(int PathIter = 0; PathIter < 16; PathIter++) {
				ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
				ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
				ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
				ENQUEUE_SPLIT_KERNEL(background_buffer_update, global_size, local_size);
				ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
				ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
				ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
				ENQUEUE_SPLIT_KERNEL(shadow_blocked, global_size_shadow_blocked, local_size);
				ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);

				if(task->get_cancel()) {
					return true;
				}
			}

			/* Decide if we should exit path-iteration in host. */
			device->mem_copy_from(ray_state, 0, global_size[0] * global_size[1] * sizeof(char), 1, 1);

			activeRaysAvailable = false;

			/* Linear scan of the copied-back states; stop at the first ray
			 * that is still alive. */
			for(int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) {
				if(int8_t(ray_state.get_data()[rayStateIter]) != RAY_INACTIVE) {
					/* Not all rays are RAY_INACTIVE. */
					activeRaysAvailable = true;
					break;
				}
			}

			if(task->get_cancel()) {
				return true;
			}
		}

		double time_per_sample = ((time_dt()-start_time) / subtile.num_samples);

		if(avg_time_per_sample == 0.0) {
			/* start rolling average */
			avg_time_per_sample = time_per_sample;
		}
		else {
			/* Exponential moving average with weight `alpha` (file-level constant). */
			avg_time_per_sample = alpha*time_per_sample + (1.0-alpha)*avg_time_per_sample;
		}

		/* Accumulate per-pixel radiance into the tile buffer with a fixed
		 * 16x16 work group, rounded up to cover the whole tile. */
		size_t sum_all_radiance_local_size[2] = {16, 16};
		size_t sum_all_radiance_global_size[2];
		sum_all_radiance_global_size[0] = round_up(tile.w, sum_all_radiance_local_size[0]);
		sum_all_radiance_global_size[1] = round_up(tile.h, sum_all_radiance_local_size[1]);

		ENQUEUE_SPLIT_KERNEL(sum_all_radiance,
		                     sum_all_radiance_global_size,
		                     sum_all_radiance_local_size);

#undef ENQUEUE_SPLIT_KERNEL

		tile.sample += subtile.num_samples;
		task->update_progress(&tile, tile.w*tile.h*subtile.num_samples);

		/* Progressively longer batches between tile updates, capped at 10x. */
		time_multiplier = min(time_multiplier << 1, 10);

		if(task->get_cancel()) {
			return true;
		}
	}

	return true;
}
284
285 CCL_NAMESPACE_END
286
287