blender.git: intern/cycles/device/device_cpu.cpp @ e70c3e0d78cb64e4f4051f898bd7b41056f32922
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include <stdlib.h>
18 #include <string.h>
19
20 /* So ImathMath is included before our kernel_cpu_compat. */
21 #ifdef WITH_OSL
22 /* So no context pollution happens from indirectly included windows.h */
23 #  include "util/util_windows.h"
24 #  include <OSL/oslexec.h>
25 #endif
26
27 #include "device/device.h"
28 #include "device/device_denoising.h"
29 #include "device/device_intern.h"
30 #include "device/device_split_kernel.h"
31
32 #include "kernel/kernel.h"
33 #include "kernel/kernel_compat_cpu.h"
34 #include "kernel/kernel_types.h"
35 #include "kernel/split/kernel_split_data.h"
36 #include "kernel/kernel_globals.h"
37
38 #include "kernel/filter/filter.h"
39
40 #include "kernel/osl/osl_shader.h"
41 #include "kernel/osl/osl_globals.h"
42
43 #include "render/buffers.h"
44
45 #include "util/util_debug.h"
46 #include "util/util_foreach.h"
47 #include "util/util_function.h"
48 #include "util/util_logging.h"
49 #include "util/util_map.h"
50 #include "util/util_opengl.h"
51 #include "util/util_optimization.h"
52 #include "util/util_progress.h"
53 #include "util/util_system.h"
54 #include "util/util_thread.h"
55
56 #include "render/coverage.h"
57
58 CCL_NAMESPACE_BEGIN
59
60 class CPUDevice;
61
62 /* Has to be outside of the class to be shared across template instantiations. */
63 static const char *logged_architecture = "";
64
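/* Runtime SIMD dispatch for the CPU kernels. Every kernel entry point is
 * compiled once per micro-architecture (SSE2, SSE3, SSE4.1, AVX, AVX2) and
 * once without optimizations; the constructor below keeps the most capable
 * variant that both the CPU and the debug flags allow, falling back to the
 * default kernel otherwise. The selected function pointer is then invoked
 * through operator(), e.g.:
 *
 *   path_trace_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
 */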
65 template<typename F>
66 class KernelFunctions {
67 public:
68         KernelFunctions()
69         {
70                 kernel = (F)NULL;
71         }
72
73         KernelFunctions(F kernel_default,
74                         F kernel_sse2,
75                         F kernel_sse3,
76                         F kernel_sse41,
77                         F kernel_avx,
78                         F kernel_avx2)
79         {
80                 const char *architecture_name = "default";
81                 kernel = kernel_default;
82
83                 /* Silence potential warnings about unused variables
84                  * when compiling without some architectures. */
85                 (void)kernel_sse2;
86                 (void)kernel_sse3;
87                 (void)kernel_sse41;
88                 (void)kernel_avx;
89                 (void)kernel_avx2;
90 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
91                 if(DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) {
92                         architecture_name = "AVX2";
93                         kernel = kernel_avx2;
94                 }
95                 else
96 #endif
97 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
98                 if(DebugFlags().cpu.has_avx() && system_cpu_support_avx()) {
99                         architecture_name = "AVX";
100                         kernel = kernel_avx;
101                 }
102                 else
103 #endif
104 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
105                 if(DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) {
106                         architecture_name = "SSE4.1";
107                         kernel = kernel_sse41;
108                 }
109                 else
110 #endif
111 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
112                 if(DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) {
113                         architecture_name = "SSE3";
114                         kernel = kernel_sse3;
115                 }
116                 else
117 #endif
118 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
119                 if(DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) {
120                         architecture_name = "SSE2";
121                         kernel = kernel_sse2;
122                 }
123 #endif
124
125                 if(strcmp(architecture_name, logged_architecture) != 0) {
126                         VLOG(1) << "Will be using " << architecture_name << " kernels.";
127                         logged_architecture = architecture_name;
128                 }
129         }
130
131         inline F operator()() const {
132                 assert(kernel);
133                 return kernel;
134         }
135 protected:
136         F kernel;
137 };
138
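/* CPU implementation of the split-kernel interface. It is not used for
 * regular CPU rendering; it is only enabled through
 * DebugFlags().cpu.split_kernel, so the split kernel code path can be
 * exercised and tested on the CPU. */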
139 class CPUSplitKernel : public DeviceSplitKernel {
140         CPUDevice *device;
141 public:
142         explicit CPUSplitKernel(CPUDevice *device);
143
144         virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
145                                                     RenderTile& rtile,
146                                                     int num_global_elements,
147                                                     device_memory& kernel_globals,
148                                                     device_memory& kernel_data_,
149                                                     device_memory& split_data,
150                                                     device_memory& ray_state,
151                                                     device_memory& queue_index,
152                                                     device_memory& use_queues_flag,
153                                                     device_memory& work_pool_wgs);
154
155         virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name,
156                                                                const DeviceRequestedFeatures&);
157         virtual int2 split_kernel_local_size();
158         virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
159         virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
160 };
161
162 class CPUDevice : public Device
163 {
164 public:
165         TaskPool task_pool;
166         KernelGlobals kernel_globals;
167
168         device_vector<TextureInfo> texture_info;
169         bool need_texture_info;
170
171 #ifdef WITH_OSL
172         OSLGlobals osl_globals;
173 #endif
174
175         bool use_split_kernel;
176
177         DeviceRequestedFeatures requested_features;
178
179         KernelFunctions<void(*)(KernelGlobals *, float *, int, int, int, int, int)>             path_trace_kernel;
180         KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_half_float_kernel;
181         KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel;
182         KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)>   shader_kernel;
183
184         KernelFunctions<void(*)(int, TileInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_divide_shadow_kernel;
185         KernelFunctions<void(*)(int, TileInfo*, int, int, int, int, float*, float*, int*, int, int)>               filter_get_feature_kernel;
186         KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                               filter_detect_outliers_kernel;
187         KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                               filter_combine_halves_kernel;
188
189         KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
190         KernelFunctions<void(*)(float*, float*, int*, int, int)>                                 filter_nlm_blur_kernel;
191         KernelFunctions<void(*)(float*, float*, int*, int, int)>                                 filter_nlm_calc_weight_kernel;
192         KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)>       filter_nlm_update_output_kernel;
193         KernelFunctions<void(*)(float*, float*, int*, int)>                                      filter_nlm_normalize_kernel;
194
195         KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)>                         filter_construct_transform_kernel;
196         KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel;
197         KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)>                            filter_finalize_kernel;
198
199         KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*,
200                                int, int, int, int, int, int, int, int, ccl_global int*, int,
201                                ccl_global char*, ccl_global unsigned int*, unsigned int, ccl_global float*)>        data_init_kernel;
202         unordered_map<string, KernelFunctions<void(*)(KernelGlobals*, KernelData*)> > split_kernels;
203
204 #define KERNEL_FUNCTIONS(name) \
205               KERNEL_NAME_EVAL(cpu, name), \
206               KERNEL_NAME_EVAL(cpu_sse2, name), \
207               KERNEL_NAME_EVAL(cpu_sse3, name), \
208               KERNEL_NAME_EVAL(cpu_sse41, name), \
209               KERNEL_NAME_EVAL(cpu_avx, name), \
210               KERNEL_NAME_EVAL(cpu_avx2, name)
211
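        /* KERNEL_FUNCTIONS(name) above expands to the six per-architecture entry
         * points of a kernel, and REGISTER_KERNEL(name) in the constructor's
         * initializer list feeds them to the matching KernelFunctions member.
         * For example, REGISTER_KERNEL(path_trace) becomes, roughly,
         *
         *   path_trace_kernel(KERNEL_NAME_EVAL(cpu, path_trace),
         *                     KERNEL_NAME_EVAL(cpu_sse2, path_trace),
         *                     ...,
         *                     KERNEL_NAME_EVAL(cpu_avx2, path_trace))
         *
         * with KERNEL_NAME_EVAL mapping to the actual kernel symbols declared
         * alongside the kernel headers. */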
212         CPUDevice(DeviceInfo& info_, Stats &stats_, bool background_)
213         : Device(info_, stats_, background_),
214           texture_info(this, "__texture_info", MEM_TEXTURE),
215 #define REGISTER_KERNEL(name) name ## _kernel(KERNEL_FUNCTIONS(name))
216           REGISTER_KERNEL(path_trace),
217           REGISTER_KERNEL(convert_to_half_float),
218           REGISTER_KERNEL(convert_to_byte),
219           REGISTER_KERNEL(shader),
220           REGISTER_KERNEL(filter_divide_shadow),
221           REGISTER_KERNEL(filter_get_feature),
222           REGISTER_KERNEL(filter_detect_outliers),
223           REGISTER_KERNEL(filter_combine_halves),
224           REGISTER_KERNEL(filter_nlm_calc_difference),
225           REGISTER_KERNEL(filter_nlm_blur),
226           REGISTER_KERNEL(filter_nlm_calc_weight),
227           REGISTER_KERNEL(filter_nlm_update_output),
228           REGISTER_KERNEL(filter_nlm_normalize),
229           REGISTER_KERNEL(filter_construct_transform),
230           REGISTER_KERNEL(filter_nlm_construct_gramian),
231           REGISTER_KERNEL(filter_finalize),
232           REGISTER_KERNEL(data_init)
233 #undef REGISTER_KERNEL
234         {
235                 if(info.cpu_threads == 0) {
236                         info.cpu_threads = TaskScheduler::num_threads();
237                 }
238
239 #ifdef WITH_OSL
240                 kernel_globals.osl = &osl_globals;
241 #endif
242                 use_split_kernel = DebugFlags().cpu.split_kernel;
243                 if(use_split_kernel) {
244                         VLOG(1) << "Will be using split kernel.";
245                 }
246                 need_texture_info = false;
247
248 #define REGISTER_SPLIT_KERNEL(name) split_kernels[#name] = KernelFunctions<void(*)(KernelGlobals*, KernelData*)>(KERNEL_FUNCTIONS(name))
249                 REGISTER_SPLIT_KERNEL(path_init);
250                 REGISTER_SPLIT_KERNEL(scene_intersect);
251                 REGISTER_SPLIT_KERNEL(lamp_emission);
252                 REGISTER_SPLIT_KERNEL(do_volume);
253                 REGISTER_SPLIT_KERNEL(queue_enqueue);
254                 REGISTER_SPLIT_KERNEL(indirect_background);
255                 REGISTER_SPLIT_KERNEL(shader_setup);
256                 REGISTER_SPLIT_KERNEL(shader_sort);
257                 REGISTER_SPLIT_KERNEL(shader_eval);
258                 REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao);
259                 REGISTER_SPLIT_KERNEL(subsurface_scatter);
260                 REGISTER_SPLIT_KERNEL(direct_lighting);
261                 REGISTER_SPLIT_KERNEL(shadow_blocked_ao);
262                 REGISTER_SPLIT_KERNEL(shadow_blocked_dl);
263                 REGISTER_SPLIT_KERNEL(enqueue_inactive);
264                 REGISTER_SPLIT_KERNEL(next_iteration_setup);
265                 REGISTER_SPLIT_KERNEL(indirect_subsurface);
266                 REGISTER_SPLIT_KERNEL(buffer_update);
267 #undef REGISTER_SPLIT_KERNEL
268 #undef KERNEL_FUNCTIONS
269         }
270
271         ~CPUDevice()
272         {
273                 task_pool.stop();
274                 texture_info.free();
275         }
276
277         virtual bool show_samples() const
278         {
279                 return (info.cpu_threads == 1);
280         }
281
282         void load_texture_info()
283         {
284                 if(need_texture_info) {
285                         texture_info.copy_to_device();
286                         need_texture_info = false;
287                 }
288         }
289
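        /* Memory management. On the CPU the device address space is the host
         * address space, so for most allocations device_pointer simply aliases
         * host_pointer and copies between "host" and "device" are no-ops. Only
         * MEM_DEVICE_ONLY memory gets its own aligned allocation, and MEM_TEXTURE
         * is routed through tex_alloc()/tex_free() below. */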
290         void mem_alloc(device_memory& mem)
291         {
292                 if(mem.type == MEM_TEXTURE) {
293                         assert(!"mem_alloc not supported for textures.");
294                 }
295                 else {
296                         if(mem.name) {
297                                 VLOG(1) << "Buffer allocate: " << mem.name << ", "
298                                                 << string_human_readable_number(mem.memory_size()) << " bytes. ("
299                                                 << string_human_readable_size(mem.memory_size()) << ")";
300                         }
301
302                         if(mem.type == MEM_DEVICE_ONLY) {
303                                 assert(!mem.host_pointer);
304                                 size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES;
305                                 void *data = util_aligned_malloc(mem.memory_size(), alignment);
306                                 mem.device_pointer = (device_ptr)data;
307                         }
308                         else {
309                                 mem.device_pointer = (device_ptr)mem.host_pointer;
310                         }
311
312                         mem.device_size = mem.memory_size();
313                         stats.mem_alloc(mem.device_size);
314                 }
315         }
316
317         void mem_copy_to(device_memory& mem)
318         {
319                 if(mem.type == MEM_TEXTURE) {
320                         tex_free(mem);
321                         tex_alloc(mem);
322                 }
323                 else if(mem.type == MEM_PIXELS) {
324                         assert(!"mem_copy_to not supported for pixels.");
325                 }
326                 else {
327                         if(!mem.device_pointer) {
328                                 mem_alloc(mem);
329                         }
330
331                         /* copy is no-op */
332                 }
333         }
334
335         void mem_copy_from(device_memory& /*mem*/,
336                            int /*y*/, int /*w*/, int /*h*/,
337                            int /*elem*/)
338         {
339                 /* no-op */
340         }
341
342         void mem_zero(device_memory& mem)
343         {
344                 if(!mem.device_pointer) {
345                         mem_alloc(mem);
346                 }
347
348                 if(mem.device_pointer) {
349                         memset((void*)mem.device_pointer, 0, mem.memory_size());
350                 }
351         }
352
353         void mem_free(device_memory& mem)
354         {
355                 if(mem.type == MEM_TEXTURE) {
356                         tex_free(mem);
357                 }
358                 else if(mem.device_pointer) {
359                         if(mem.type == MEM_DEVICE_ONLY) {
360                                 util_aligned_free((void*)mem.device_pointer);
361                         }
362                         mem.device_pointer = 0;
363                         stats.mem_free(mem.device_size);
364                         mem.device_size = 0;
365                 }
366         }
367
368         virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/)
369         {
370                 return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset));
371         }
372
373         void const_copy_to(const char *name, void *host, size_t size)
374         {
375                 kernel_const_copy(&kernel_globals, name, host, size);
376         }
377
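        /* Textures. Data textures (no interpolation) are copied straight into the
         * KernelGlobals arrays via kernel_tex_copy(). Image textures get a slot in
         * texture_info, with the slot index parsed from the trailing number of the
         * "__tex_image_..." name; the table is uploaded lazily by
         * load_texture_info() before a task runs. */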
378         void tex_alloc(device_memory& mem)
379         {
380                 VLOG(1) << "Texture allocate: " << mem.name << ", "
381                         << string_human_readable_number(mem.memory_size()) << " bytes. ("
382                         << string_human_readable_size(mem.memory_size()) << ")";
383
384                 if(mem.interpolation == INTERPOLATION_NONE) {
385                         /* Data texture. */
386                         kernel_tex_copy(&kernel_globals,
387                                                         mem.name,
388                                                         mem.host_pointer,
389                                                         mem.data_size);
390                 }
391                 else {
392                         /* Image texture. */
393                         int flat_slot = 0;
394                         if(string_startswith(mem.name, "__tex_image")) {
395                                 int pos = string(mem.name).rfind("_");
396                                 flat_slot = atoi(mem.name + pos + 1);
397                         }
398                         else {
399                                 assert(0);
400                         }
401
402                         if(flat_slot >= texture_info.size()) {
403                                 /* Allocate some slots in advance, to reduce the
404                                  * number of re-allocations. */
405                                 texture_info.resize(flat_slot + 128);
406                         }
407
408                         TextureInfo& info = texture_info[flat_slot];
409                         info.data = (uint64_t)mem.host_pointer;
410                         info.cl_buffer = 0;
411                         info.interpolation = mem.interpolation;
412                         info.extension = mem.extension;
413                         info.width = mem.data_width;
414                         info.height = mem.data_height;
415                         info.depth = mem.data_depth;
416
417                         need_texture_info = true;
418                 }
419
420                 mem.device_pointer = (device_ptr)mem.host_pointer;
421                 mem.device_size = mem.memory_size();
422                 stats.mem_alloc(mem.device_size);
423         }
424
425         void tex_free(device_memory& mem)
426         {
427                 if(mem.device_pointer) {
428                         mem.device_pointer = 0;
429                         stats.mem_free(mem.device_size);
430                         mem.device_size = 0;
431                         need_texture_info = true;
432                 }
433         }
434
435         void *osl_memory()
436         {
437 #ifdef WITH_OSL
438                 return &osl_globals;
439 #else
440                 return NULL;
441 #endif
442         }
443
444         void thread_run(DeviceTask *task)
445         {
446                 if(task->type == DeviceTask::RENDER) {
447                         thread_render(*task);
448                 }
449                 else if(task->type == DeviceTask::FILM_CONVERT)
450                         thread_film_convert(*task);
451                 else if(task->type == DeviceTask::SHADER)
452                         thread_shader(*task);
453         }
454
455         class CPUDeviceTask : public DeviceTask {
456         public:
457                 CPUDeviceTask(CPUDevice *device, DeviceTask& task)
458                 : DeviceTask(task)
459                 {
460                         run = function_bind(&CPUDevice::thread_run, device, this);
461                 }
462         };
463
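        /* Denoising. The functions below are bound into DenoisingTask::functions
         * in denoise() and run the denoising filter kernels single-threaded over
         * the tile. denoising_non_local_means() accumulates, for every offset
         * (dx, dy) in a (2r+1)x(2r+1) window, per-pixel NLM weights into
         * weightAccum and the weighted image into out_ptr, then normalizes. */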
464         bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
465                                        DenoisingTask *task)
466         {
467                 int4 rect = task->rect;
468                 int   r   = task->nlm_state.r;
469                 int   f   = task->nlm_state.f;
470                 float a   = task->nlm_state.a;
471                 float k_2 = task->nlm_state.k_2;
472
473                 int w = align_up(rect.z-rect.x, 4);
474                 int h = rect.w-rect.y;
475
476                 float *blurDifference = (float*) task->nlm_state.temporary_1_ptr;
477                 float *difference     = (float*) task->nlm_state.temporary_2_ptr;
478                 float *weightAccum    = (float*) task->nlm_state.temporary_3_ptr;
479
480                 memset(weightAccum, 0, sizeof(float)*w*h);
481                 memset((float*) out_ptr, 0, sizeof(float)*w*h);
482
483                 for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
484                         int dy = i / (2*r+1) - r;
485                         int dx = i % (2*r+1) - r;
486
487                         int local_rect[4] = {max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)};
488                         filter_nlm_calc_difference_kernel()(dx, dy,
489                                                             (float*) guide_ptr,
490                                                             (float*) variance_ptr,
491                                                             difference,
492                                                             local_rect,
493                                                             w, 0,
494                                                             a, k_2);
495
496                         filter_nlm_blur_kernel()       (difference, blurDifference, local_rect, w, f);
497                         filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f);
498                         filter_nlm_blur_kernel()       (difference, blurDifference, local_rect, w, f);
499
500                         filter_nlm_update_output_kernel()(dx, dy,
501                                                           blurDifference,
502                                                           (float*) image_ptr,
503                                                           (float*) out_ptr,
504                                                           weightAccum,
505                                                           local_rect,
506                                                           w, f);
507                 }
508
509                 int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y};
510                 filter_nlm_normalize_kernel()((float*) out_ptr, weightAccum, local_rect, w);
511
512                 return true;
513         }
514
515         bool denoising_construct_transform(DenoisingTask *task)
516         {
517                 for(int y = 0; y < task->filter_area.w; y++) {
518                         for(int x = 0; x < task->filter_area.z; x++) {
519                                 filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer,
520                                                                     x + task->filter_area.x,
521                                                                     y + task->filter_area.y,
522                                                                     y*task->filter_area.z + x,
523                                                                     (float*) task->storage.transform.device_pointer,
524                                                                     (int*)   task->storage.rank.device_pointer,
525                                                                     &task->rect.x,
526                                                                     task->buffer.pass_stride,
527                                                                     task->radius,
528                                                                     task->pca_threshold);
529                         }
530                 }
531                 return true;
532         }
533
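        /* Reconstruction: for each window offset the NLM weights are used to
         * accumulate the weighted design matrices XtWX and right-hand sides XtWY
         * per pixel, and filter_finalize_kernel() then solves that system to
         * produce the filtered output (effectively a weighted least-squares fit
         * of the denoising features, as in the other Cycles denoising backends). */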
534         bool denoising_reconstruct(device_ptr color_ptr,
535                                    device_ptr color_variance_ptr,
536                                    device_ptr output_ptr,
537                                    DenoisingTask *task)
538         {
539                 mem_zero(task->storage.XtWX);
540                 mem_zero(task->storage.XtWY);
541
542                 float *difference     = (float*) task->reconstruction_state.temporary_1_ptr;
543                 float *blurDifference = (float*) task->reconstruction_state.temporary_2_ptr;
544
545                 int r = task->radius;
546                 for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
547                         int dy = i / (2*r+1) - r;
548                         int dx = i % (2*r+1) - r;
549
550                         int local_rect[4] = {max(0, -dx), max(0, -dy),
551                                              task->reconstruction_state.source_w - max(0, dx),
552                                              task->reconstruction_state.source_h - max(0, dy)};
553                         filter_nlm_calc_difference_kernel()(dx, dy,
554                                                             (float*) color_ptr,
555                                                             (float*) color_variance_ptr,
556                                                             difference,
557                                                             local_rect,
558                                                             task->buffer.stride,
559                                                             task->buffer.pass_stride,
560                                                             1.0f,
561                                                             task->nlm_k_2);
562                         filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
563                         filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.stride, 4);
564                         filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
565                         filter_nlm_construct_gramian_kernel()(dx, dy,
566                                                               blurDifference,
567                                                               (float*)  task->buffer.mem.device_pointer,
568                                                               (float*)  task->storage.transform.device_pointer,
569                                                               (int*)    task->storage.rank.device_pointer,
570                                                               (float*)  task->storage.XtWX.device_pointer,
571                                                               (float3*) task->storage.XtWY.device_pointer,
572                                                               local_rect,
573                                                               &task->reconstruction_state.filter_window.x,
574                                                               task->buffer.stride,
575                                                               4,
576                                                               task->buffer.pass_stride);
577                 }
578                 for(int y = 0; y < task->filter_area.w; y++) {
579                         for(int x = 0; x < task->filter_area.z; x++) {
580                                 filter_finalize_kernel()(x,
581                                                          y,
582                                                          y*task->filter_area.z + x,
583                                                          (float*)  output_ptr,
584                                                          (int*)    task->storage.rank.device_pointer,
585                                                          (float*)  task->storage.XtWX.device_pointer,
586                                                          (float3*) task->storage.XtWY.device_pointer,
587                                                          &task->reconstruction_state.buffer_params.x,
588                                                          task->render_buffer.samples);
589                         }
590                 }
591                 return true;
592         }
593
594         bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
595                                       device_ptr mean_ptr, device_ptr variance_ptr,
596                                       int r, int4 rect, DenoisingTask * /*task*/)
597         {
598                 for(int y = rect.y; y < rect.w; y++) {
599                         for(int x = rect.x; x < rect.z; x++) {
600                                 filter_combine_halves_kernel()(x, y,
601                                                                (float*) mean_ptr,
602                                                                (float*) variance_ptr,
603                                                                (float*) a_ptr,
604                                                                (float*) b_ptr,
605                                                                &rect.x,
606                                                                r);
607                         }
608                 }
609                 return true;
610         }
611
612         bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr,
613                                      device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
614                                      device_ptr buffer_variance_ptr, DenoisingTask *task)
615         {
616                 for(int y = task->rect.y; y < task->rect.w; y++) {
617                         for(int x = task->rect.x; x < task->rect.z; x++) {
618                                 filter_divide_shadow_kernel()(task->render_buffer.samples,
619                                                               task->tile_info,
620                                                               x, y,
621                                                               (float*) a_ptr,
622                                                               (float*) b_ptr,
623                                                               (float*) sample_variance_ptr,
624                                                               (float*) sv_variance_ptr,
625                                                               (float*) buffer_variance_ptr,
626                                                               &task->rect.x,
627                                                               task->render_buffer.pass_stride,
628                                                               task->render_buffer.offset);
629                         }
630                 }
631                 return true;
632         }
633
634         bool denoising_get_feature(int mean_offset,
635                                    int variance_offset,
636                                    device_ptr mean_ptr,
637                                    device_ptr variance_ptr,
638                                    DenoisingTask *task)
639         {
640                 for(int y = task->rect.y; y < task->rect.w; y++) {
641                         for(int x = task->rect.x; x < task->rect.z; x++) {
642                                 filter_get_feature_kernel()(task->render_buffer.samples,
643                                                             task->tile_info,
644                                                             mean_offset,
645                                                             variance_offset,
646                                                             x, y,
647                                                             (float*) mean_ptr,
648                                                             (float*) variance_ptr,
649                                                             &task->rect.x,
650                                                             task->render_buffer.pass_stride,
651                                                             task->render_buffer.offset);
652                         }
653                 }
654                 return true;
655         }
656
657         bool denoising_detect_outliers(device_ptr image_ptr,
658                                        device_ptr variance_ptr,
659                                        device_ptr depth_ptr,
660                                        device_ptr output_ptr,
661                                        DenoisingTask *task)
662         {
663                 for(int y = task->rect.y; y < task->rect.w; y++) {
664                         for(int x = task->rect.x; x < task->rect.z; x++) {
665                                 filter_detect_outliers_kernel()(x, y,
666                                                                 (float*) image_ptr,
667                                                                 (float*) variance_ptr,
668                                                                 (float*) depth_ptr,
669                                                                 (float*) output_ptr,
670                                                                 &task->rect.x,
671                                                                 task->buffer.pass_stride);
672                         }
673                 }
674                 return true;
675         }
676
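        /* Main megakernel path tracing loop for one tile: the outer loop is over
         * samples, the inner loops over pixels, and progress is reported after
         * every sample. When accurate Cryptomatte is enabled, per-pixel coverage
         * maps are filled by pointing kg->coverage_* at the matching vector entry
         * before invoking the kernel; they are flattened into AOVs afterwards in
         * thread_render(). */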
677         void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg, vector<map<float, float> >& coverage_object, vector<map<float, float> >& coverage_material, vector<map<float, float > >& coverage_asset)
678         {
679                 scoped_timer timer(&tile.buffers->render_time);
680                 kg->coverage_object = kg->coverage_material = kg->coverage_asset = NULL;
681
682                 if(kg->__data.film.use_cryptomatte & CRYPT_ACCURATE) {
683                         if(kg->__data.film.use_cryptomatte & CRYPT_OBJECT) {
684                                 coverage_object.clear();
685                                 coverage_object.resize(tile.w * tile.h);
686                         }
687                         if(kg->__data.film.use_cryptomatte & CRYPT_MATERIAL) {
688                                 coverage_material.clear();
689                                 coverage_material.resize(tile.w * tile.h);
690                         }
691                         if(kg->__data.film.use_cryptomatte & CRYPT_ASSET) {
692                                 coverage_asset.clear();
693                                 coverage_asset.resize(tile.w * tile.h);
694                         }
695                 }
696
697                 float *render_buffer = (float*)tile.buffer;
698                 int start_sample = tile.start_sample;
699                 int end_sample = tile.start_sample + tile.num_samples;
700
701                 for(int sample = start_sample; sample < end_sample; sample++) {
702                         if(task.get_cancel() || task_pool.canceled()) {
703                                 if(task.need_finish_queue == false)
704                                         break;
705                         }
706
707                         for(int y = tile.y; y < tile.y + tile.h; y++) {
708                                 for(int x = tile.x; x < tile.x + tile.w; x++) {
709                                         if(kg->__data.film.use_cryptomatte & CRYPT_ACCURATE) {
710                                                 if(kg->__data.film.use_cryptomatte & CRYPT_OBJECT) {
711                                                         kg->coverage_object = &coverage_object[tile.w * (y - tile.y) + x - tile.x];
712                                                 }
713                                                 if(kg->__data.film.use_cryptomatte & CRYPT_MATERIAL) {
714                                                         kg->coverage_material = &coverage_material[tile.w * (y - tile.y) + x - tile.x];
715                                                 }
716                                                 if(kg->__data.film.use_cryptomatte & CRYPT_ASSET) {
717                                                         kg->coverage_asset = &coverage_asset[tile.w * (y - tile.y) + x - tile.x];
718                                                 }
719                                         }
720                                         path_trace_kernel()(kg, render_buffer,
721                                                             sample, x, y, tile.offset, tile.stride);
722                                 }
723                         }
724
725                         tile.sample = sample + 1;
726
727                         task.update_progress(&tile, tile.w*tile.h);
728                 }
729         }
730
731         void denoise(DenoisingTask& denoising, RenderTile &tile)
732         {
733                 tile.sample = tile.start_sample + tile.num_samples;
734
735                 denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising);
736                 denoising.functions.reconstruct = function_bind(&CPUDevice::denoising_reconstruct, this, _1, _2, _3, &denoising);
737                 denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
738                 denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
739                 denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
740                 denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
741                 denoising.functions.detect_outliers = function_bind(&CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
742
743                 denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
744                 denoising.render_buffer.samples = tile.sample;
745
746                 denoising.run_denoising(&tile);
747         }
748
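        /* Per-thread render entry point. A KernelGlobals copy for this thread is
         * constructed with placement new inside a device-only buffer, which is
         * also what the split kernel path receives as its kernel globals memory;
         * tiles are then acquired and rendered (or denoised) until the task runs
         * out of tiles or is cancelled. */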
749         void thread_render(DeviceTask& task)
750         {
751                 if(task_pool.canceled()) {
752                         if(task.need_finish_queue == false)
753                                 return;
754                 }
755
756                 /* allocate buffer for kernel globals */
757                 device_only_memory<KernelGlobals> kgbuffer(this, "kernel_globals");
758                 kgbuffer.alloc_to_device(1);
759
760                 KernelGlobals *kg = new ((void*) kgbuffer.device_pointer) KernelGlobals(thread_kernel_globals_init());
761
762                 CPUSplitKernel *split_kernel = NULL;
763                 if(use_split_kernel) {
764                         split_kernel = new CPUSplitKernel(this);
765                         if(!split_kernel->load_kernels(requested_features)) {
766                                 thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
767                                 kgbuffer.free();
768                                 delete split_kernel;
769                                 return;
770                         }
771                 }
772
773                 RenderTile tile;
774                 DenoisingTask denoising(this, task);
775
776                 while(task.acquire_tile(this, tile)) {
777                         if(tile.task == RenderTile::PATH_TRACE) {
778                                 /* Cryptomatte data. This needs a better place than here. */
779                                 vector<map<float, float> > coverage_object;
780                                 vector<map<float, float> > coverage_material;
781                                 vector<map<float, float> > coverage_asset;
782
783                                 if(use_split_kernel) {
784                                         device_only_memory<uchar> void_buffer(this, "void_buffer");
785                                         split_kernel->path_trace(&task, tile, kgbuffer, void_buffer);
786                                 }
787                                 else {
788                                         path_trace(task, tile, kg, coverage_object, coverage_material, coverage_asset);
789                                 }
790                                 if(kg->__data.film.use_cryptomatte & CRYPT_ACCURATE) {
791                                         int aov_index = 0;
792                                         if(kg->__data.film.use_cryptomatte & CRYPT_OBJECT) {
793                                                 aov_index += flatten_coverage(kg, coverage_object, tile, aov_index);
794                                         }
795                                         if(kg->__data.film.use_cryptomatte & CRYPT_MATERIAL) {
796                                                 aov_index += flatten_coverage(kg, coverage_material, tile, aov_index);
797                                         }
798                                         if(kg->__data.film.use_cryptomatte & CRYPT_ASSET) {
799                                                 aov_index += flatten_coverage(kg, coverage_asset, tile, aov_index);
800                                         }
801                                 }
802                         }
803                         else if(tile.task == RenderTile::DENOISE) {
804                                 denoise(denoising, tile);
805                                 task.update_progress(&tile, tile.w*tile.h);
806                         }
807
808                         task.release_tile(tile);
809
810                         if(task_pool.canceled()) {
811                                 if(task.need_finish_queue == false)
812                                         break;
813                         }
814                 }
815
816                 thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
817                 kg->~KernelGlobals();
818                 kgbuffer.free();
819                 delete split_kernel;
820         }
821
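        /* Film conversion: scales the accumulated float buffer by 1/(sample + 1)
         * and writes either half-float or byte RGBA pixels, depending on which
         * output the task provides. */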
822         void thread_film_convert(DeviceTask& task)
823         {
824                 float sample_scale = 1.0f/(task.sample + 1);
825
826                 if(task.rgba_half) {
827                         for(int y = task.y; y < task.y + task.h; y++)
828                                 for(int x = task.x; x < task.x + task.w; x++)
829                                         convert_to_half_float_kernel()(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
830                                                                        sample_scale, x, y, task.offset, task.stride);
831                 }
832                 else {
833                         for(int y = task.y; y < task.y + task.h; y++)
834                                 for(int x = task.x; x < task.x + task.w; x++)
835                                         convert_to_byte_kernel()(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
836                                                                  sample_scale, x, y, task.offset, task.stride);
837
838                 }
839         }
840
841         void thread_shader(DeviceTask& task)
842         {
843                 KernelGlobals kg = kernel_globals;
844
845 #ifdef WITH_OSL
846                 OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
847 #endif
848                 for(int sample = 0; sample < task.num_samples; sample++) {
849                         for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
850                                 shader_kernel()(&kg,
851                                                 (uint4*)task.shader_input,
852                                                 (float4*)task.shader_output,
853                                                 task.shader_eval_type,
854                                                 task.shader_filter,
855                                                 x,
856                                                 task.offset,
857                                                 sample);
858
859                         if(task.get_cancel() || task_pool.canceled())
860                                 break;
861
862                         task.update_progress(NULL);
863
864                 }
865
866 #ifdef WITH_OSL
867                 OSLShader::thread_free(&kg);
868 #endif
869         }
870
871         int get_split_task_count(DeviceTask& task)
872         {
873                 if(task.type == DeviceTask::SHADER)
874                         return task.get_subtask_count(info.cpu_threads, 256);
875                 else
876                         return task.get_subtask_count(info.cpu_threads);
877         }
878
879         void task_add(DeviceTask& task)
880         {
881                 /* Load texture info. */
882                 load_texture_info();
883
884                 /* split task into smaller ones */
885                 list<DeviceTask> tasks;
886
887                 if(task.type == DeviceTask::SHADER)
888                         task.split(tasks, info.cpu_threads, 256);
889                 else
890                         task.split(tasks, info.cpu_threads);
891
892                 foreach(DeviceTask& task, tasks)
893                         task_pool.push(new CPUDeviceTask(this, task));
894         }
895
896         void task_wait()
897         {
898                 task_pool.wait_work();
899         }
900
901         void task_cancel()
902         {
903                 task_pool.cancel();
904         }
905
906 protected:
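        /* Each render thread works on its own copy of the device-wide
         * KernelGlobals: the copy gets cleared per-thread caches (transparent
         * shadow intersections, decoupled volume steps) and, with OSL, its own
         * per-thread OSL state; thread_kernel_globals_free() releases those
         * buffers again when the thread is done. */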
907         inline KernelGlobals thread_kernel_globals_init()
908         {
909                 KernelGlobals kg = kernel_globals;
910                 kg.transparent_shadow_intersections = NULL;
911                 const int decoupled_count = sizeof(kg.decoupled_volume_steps) /
912                                             sizeof(*kg.decoupled_volume_steps);
913                 for(int i = 0; i < decoupled_count; ++i) {
914                         kg.decoupled_volume_steps[i] = NULL;
915                 }
916                 kg.decoupled_volume_steps_index = 0;
917 #ifdef WITH_OSL
918                 OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
919 #endif
920                 return kg;
921         }
922
923         inline void thread_kernel_globals_free(KernelGlobals *kg)
924         {
925                 if(kg == NULL) {
926                         return;
927                 }
928
929                 if(kg->transparent_shadow_intersections != NULL) {
930                         free(kg->transparent_shadow_intersections);
931                 }
932                 const int decoupled_count = sizeof(kg->decoupled_volume_steps) /
933                                             sizeof(*kg->decoupled_volume_steps);
934                 for(int i = 0; i < decoupled_count; ++i) {
935                         if(kg->decoupled_volume_steps[i] != NULL) {
936                                 free(kg->decoupled_volume_steps[i]);
937                         }
938                 }
939 #ifdef WITH_OSL
940                 OSLShader::thread_free(kg);
941 #endif
942         }
943
944         virtual bool load_kernels(const DeviceRequestedFeatures& requested_features_) {
945                 requested_features = requested_features_;
946
947                 return true;
948         }
949 };
950
951 /* split kernel */
952
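/* The CPU "split kernel" simply emulates a GPU dispatch: enqueue() loops over
 * the requested global work size serially, setting kg->global_id for every
 * (x, y) work item before calling the kernel function, and the reported
 * local/global sizes are 1x1 so only a single state element is ever needed. */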
953 class CPUSplitKernelFunction : public SplitKernelFunction {
954 public:
955         CPUDevice* device;
956         void (*func)(KernelGlobals *kg, KernelData *data);
957
958         CPUSplitKernelFunction(CPUDevice* device) : device(device), func(NULL) {}
959         ~CPUSplitKernelFunction() {}
960
961         virtual bool enqueue(const KernelDimensions& dim, device_memory& kernel_globals, device_memory& data)
962         {
963                 if(!func) {
964                         return false;
965                 }
966
967                 KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
968                 kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
969
970                 for(int y = 0; y < dim.global_size[1]; y++) {
971                         for(int x = 0; x < dim.global_size[0]; x++) {
972                                 kg->global_id = make_int2(x, y);
973
974                                 func(kg, (KernelData*)data.device_pointer);
975                         }
976                 }
977
978                 return true;
979         }
980 };
981
982 CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device)
983 {
984 }
985
986 bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
987                                                     RenderTile& rtile,
988                                                     int num_global_elements,
989                                                     device_memory& kernel_globals,
990                                                     device_memory& data,
991                                                     device_memory& split_data,
992                                                     device_memory& ray_state,
993                                                     device_memory& queue_index,
994                                                     device_memory& use_queues_flags,
995                                                     device_memory& work_pool_wgs)
996 {
997         KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
998         kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
999
1000         for(int y = 0; y < dim.global_size[1]; y++) {
1001                 for(int x = 0; x < dim.global_size[0]; x++) {
1002                         kg->global_id = make_int2(x, y);
1003
1004                         device->data_init_kernel()((KernelGlobals*)kernel_globals.device_pointer,
1005                                                    (KernelData*)data.device_pointer,
1006                                                    (void*)split_data.device_pointer,
1007                                                    num_global_elements,
1008                                                    (char*)ray_state.device_pointer,
1009                                                    rtile.start_sample,
1010                                                    rtile.start_sample + rtile.num_samples,
1011                                                    rtile.x,
1012                                                    rtile.y,
1013                                                    rtile.w,
1014                                                    rtile.h,
1015                                                    rtile.offset,
1016                                                    rtile.stride,
1017                                                    (int*)queue_index.device_pointer,
1018                                                    dim.global_size[0] * dim.global_size[1],
1019                                                    (char*)use_queues_flags.device_pointer,
1020                                                    (uint*)work_pool_wgs.device_pointer,
1021                                                    rtile.num_samples,
1022                                                    (float*)rtile.buffer);
1023                 }
1024         }
1025
1026         return true;
1027 }
1028
1029 SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(const string& kernel_name,
1030                                                                const DeviceRequestedFeatures&)
1031 {
1032         CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);
1033
1034         kernel->func = device->split_kernels[kernel_name]();
1035         if(!kernel->func) {
1036                 delete kernel;
1037                 return NULL;
1038         }
1039
1040         return kernel;
1041 }
1042
1043 int2 CPUSplitKernel::split_kernel_local_size()
1044 {
1045         return make_int2(1, 1);
1046 }
1047
1048 int2 CPUSplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask * /*task*/) {
1049         return make_int2(1, 1);
1050 }
1051
1052 uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device_memory& /*data*/, size_t num_threads) {
1053         KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
1054
1055         return split_data_buffer_size(kg, num_threads);
1056 }
1057
1058 Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
1059 {
1060         return new CPUDevice(info, stats, background);
1061 }
1062
1063 void device_cpu_info(vector<DeviceInfo>& devices)
1064 {
1065         DeviceInfo info;
1066
1067         info.type = DEVICE_CPU;
1068         info.description = system_cpu_brand_string();
1069         info.id = "CPU";
1070         info.num = 0;
1071         info.advanced_shading = true;
1072         info.bvh_layout_mask = BVH_LAYOUT_BVH2;
1073         if (system_cpu_support_sse2()) {
1074                 info.bvh_layout_mask |= BVH_LAYOUT_BVH4;
1075         }
1076         info.has_volume_decoupled = true;
1077         info.has_osl = true;
1078         info.has_half_images = true;
1079
1080         devices.insert(devices.begin(), info);
1081 }
1082
1083 string device_cpu_capabilities(void)
1084 {
1085         string capabilities = "";
1086         capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
1087         capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
1088         capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
1089         capabilities += system_cpu_support_avx() ? "AVX " : "";
1090         capabilities += system_cpu_support_avx2() ? "AVX2" : "";
1091         if(!capabilities.empty() && capabilities[capabilities.size() - 1] == ' ')
1092                 capabilities.resize(capabilities.size() - 1);
1093         return capabilities;
1094 }
1095
1096 CCL_NAMESPACE_END