/* intern/cycles/device/device_cpu.cpp */
/*
 * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <stdlib.h>
#include <string.h>

/* So ImathMath is included before our kernel_cpu_compat. */
#ifdef WITH_OSL
/* So no context pollution happens from indirectly included windows.h */
#  include "util/util_windows.h"
#  include <OSL/oslexec.h>
#endif

#include "device/device.h"
#include "device/device_denoising.h"
#include "device/device_intern.h"
#include "device/device_split_kernel.h"

#include "kernel/kernel.h"
#include "kernel/kernel_compat_cpu.h"
#include "kernel/kernel_types.h"
#include "kernel/split/kernel_split_data.h"
#include "kernel/kernel_globals.h"

#include "kernel/filter/filter.h"

#include "kernel/osl/osl_shader.h"
#include "kernel/osl/osl_globals.h"

#include "render/buffers.h"
#include "render/coverage.h"

#include "util/util_debug.h"
#include "util/util_foreach.h"
#include "util/util_function.h"
#include "util/util_logging.h"
#include "util/util_map.h"
#include "util/util_opengl.h"
#include "util/util_optimization.h"
#include "util/util_progress.h"
#include "util/util_system.h"
#include "util/util_thread.h"

CCL_NAMESPACE_BEGIN

class CPUDevice;

/* Has to be outside of the class to be shared across template instantiations. */
static const char *logged_architecture = "";

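/* KernelFunctions wraps one kernel entry point compiled for several CPU
 * instruction sets. The constructor picks the widest ISA that is both compiled
 * in (WITH_CYCLES_OPTIMIZED_KERNEL_*) and supported by the CPU at runtime,
 * preferring AVX2 > AVX > SSE4.1 > SSE3 > SSE2 > default. Calling the wrapper
 * with operator() returns the selected function pointer, so a typical call
 * looks like (illustrative only):
 *
 *   path_trace_kernel()(kg, render_buffer, sample, x, y, offset, stride);
 */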
template<typename F>
class KernelFunctions {
public:
        KernelFunctions()
        {
                kernel = (F)NULL;
        }

        KernelFunctions(F kernel_default,
                        F kernel_sse2,
                        F kernel_sse3,
                        F kernel_sse41,
                        F kernel_avx,
                        F kernel_avx2)
        {
                const char *architecture_name = "default";
                kernel = kernel_default;

                /* Silence potential warnings about unused variables
                 * when compiling without some architectures. */
                (void) kernel_sse2;
                (void) kernel_sse3;
                (void) kernel_sse41;
                (void) kernel_avx;
                (void) kernel_avx2;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
                if(DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) {
                        architecture_name = "AVX2";
                        kernel = kernel_avx2;
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
                if(DebugFlags().cpu.has_avx() && system_cpu_support_avx()) {
                        architecture_name = "AVX";
                        kernel = kernel_avx;
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
                if(DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) {
                        architecture_name = "SSE4.1";
                        kernel = kernel_sse41;
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
                if(DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) {
                        architecture_name = "SSE3";
                        kernel = kernel_sse3;
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
                if(DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) {
                        architecture_name = "SSE2";
                        kernel = kernel_sse2;
                }
#endif

                if(strcmp(architecture_name, logged_architecture) != 0) {
                        VLOG(1) << "Will be using " << architecture_name << " kernels.";
                        logged_architecture = architecture_name;
                }
        }

        inline F operator()() const {
                assert(kernel);
                return kernel;
        }
protected:
        F kernel;
};

class CPUSplitKernel : public DeviceSplitKernel {
        CPUDevice *device;
public:
        explicit CPUSplitKernel(CPUDevice *device);

        virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
                                                    RenderTile& rtile,
                                                    int num_global_elements,
                                                    device_memory& kernel_globals,
                                                    device_memory& kernel_data_,
                                                    device_memory& split_data,
                                                    device_memory& ray_state,
                                                    device_memory& queue_index,
                                                    device_memory& use_queues_flag,
                                                    device_memory& work_pool_wgs);

        virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name,
                                                               const DeviceRequestedFeatures&);
        virtual int2 split_kernel_local_size();
        virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
        virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
};

class CPUDevice : public Device
{
public:
        TaskPool task_pool;
        KernelGlobals kernel_globals;

        device_vector<TextureInfo> texture_info;
        bool need_texture_info;

#ifdef WITH_OSL
        OSLGlobals osl_globals;
#endif

        bool use_split_kernel;

        DeviceRequestedFeatures requested_features;

        KernelFunctions<void(*)(KernelGlobals *, float *, int, int, int, int, int)>             path_trace_kernel;
        KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_half_float_kernel;
        KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel;
        KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)>   shader_kernel;

        KernelFunctions<void(*)(int, TileInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int)>  filter_divide_shadow_kernel;
        KernelFunctions<void(*)(int, TileInfo*, int, int, int, int, float*, float*, float, int*, int, int)>         filter_get_feature_kernel;
        KernelFunctions<void(*)(int, int, int, int*, float*, float*, int, int*)>                                    filter_write_feature_kernel;
        KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                               filter_detect_outliers_kernel;
        KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                               filter_combine_halves_kernel;

        KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int, int, float, float)> filter_nlm_calc_difference_kernel;
        KernelFunctions<void(*)(float*, float*, int*, int, int)>                                              filter_nlm_blur_kernel;
        KernelFunctions<void(*)(float*, float*, int*, int, int)>                                              filter_nlm_calc_weight_kernel;
        KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, int, int, int)>       filter_nlm_update_output_kernel;
        KernelFunctions<void(*)(float*, float*, int*, int)>                                                   filter_nlm_normalize_kernel;

        KernelFunctions<void(*)(float*, TileInfo*, int, int, int, float*, int*, int*, int, int, bool, int, float)>                   filter_construct_transform_kernel;
        KernelFunctions<void(*)(int, int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int, bool)> filter_nlm_construct_gramian_kernel;
        KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)>                                            filter_finalize_kernel;

        KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*,
                               int, int, int, int, int, int, int, int, ccl_global int*, int,
                               ccl_global char*, ccl_global unsigned int*, unsigned int, ccl_global float*)>        data_init_kernel;
        unordered_map<string, KernelFunctions<void(*)(KernelGlobals*, KernelData*)> > split_kernels;

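/* Helper macros: KERNEL_FUNCTIONS(name) expands to the six per-ISA entry
 * points for one kernel, and REGISTER_KERNEL(name) below hands them to the
 * matching KernelFunctions member. For example, REGISTER_KERNEL(path_trace)
 * expands roughly to:
 *
 *   path_trace_kernel(kernel_cpu_path_trace,
 *                     kernel_cpu_sse2_path_trace,
 *                     ...,
 *                     kernel_cpu_avx2_path_trace)
 *
 * (The exact symbol names depend on how KERNEL_NAME_EVAL is defined in the
 * kernel headers; the above is only illustrative.) */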
#define KERNEL_FUNCTIONS(name) \
              KERNEL_NAME_EVAL(cpu, name), \
              KERNEL_NAME_EVAL(cpu_sse2, name), \
              KERNEL_NAME_EVAL(cpu_sse3, name), \
              KERNEL_NAME_EVAL(cpu_sse41, name), \
              KERNEL_NAME_EVAL(cpu_avx, name), \
              KERNEL_NAME_EVAL(cpu_avx2, name)

        CPUDevice(DeviceInfo& info_, Stats &stats_, Profiler &profiler_, bool background_)
        : Device(info_, stats_, profiler_, background_),
          texture_info(this, "__texture_info", MEM_TEXTURE),
#define REGISTER_KERNEL(name) name ## _kernel(KERNEL_FUNCTIONS(name))
          REGISTER_KERNEL(path_trace),
          REGISTER_KERNEL(convert_to_half_float),
          REGISTER_KERNEL(convert_to_byte),
          REGISTER_KERNEL(shader),
          REGISTER_KERNEL(filter_divide_shadow),
          REGISTER_KERNEL(filter_get_feature),
          REGISTER_KERNEL(filter_write_feature),
          REGISTER_KERNEL(filter_detect_outliers),
          REGISTER_KERNEL(filter_combine_halves),
          REGISTER_KERNEL(filter_nlm_calc_difference),
          REGISTER_KERNEL(filter_nlm_blur),
          REGISTER_KERNEL(filter_nlm_calc_weight),
          REGISTER_KERNEL(filter_nlm_update_output),
          REGISTER_KERNEL(filter_nlm_normalize),
          REGISTER_KERNEL(filter_construct_transform),
          REGISTER_KERNEL(filter_nlm_construct_gramian),
          REGISTER_KERNEL(filter_finalize),
          REGISTER_KERNEL(data_init)
#undef REGISTER_KERNEL
        {
                if(info.cpu_threads == 0) {
                        info.cpu_threads = TaskScheduler::num_threads();
                }

#ifdef WITH_OSL
                kernel_globals.osl = &osl_globals;
#endif
                use_split_kernel = DebugFlags().cpu.split_kernel;
                if(use_split_kernel) {
                        VLOG(1) << "Will be using split kernel.";
                }
                need_texture_info = false;

#define REGISTER_SPLIT_KERNEL(name) split_kernels[#name] = KernelFunctions<void(*)(KernelGlobals*, KernelData*)>(KERNEL_FUNCTIONS(name))
                REGISTER_SPLIT_KERNEL(path_init);
                REGISTER_SPLIT_KERNEL(scene_intersect);
                REGISTER_SPLIT_KERNEL(lamp_emission);
                REGISTER_SPLIT_KERNEL(do_volume);
                REGISTER_SPLIT_KERNEL(queue_enqueue);
                REGISTER_SPLIT_KERNEL(indirect_background);
                REGISTER_SPLIT_KERNEL(shader_setup);
                REGISTER_SPLIT_KERNEL(shader_sort);
                REGISTER_SPLIT_KERNEL(shader_eval);
                REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao);
                REGISTER_SPLIT_KERNEL(subsurface_scatter);
                REGISTER_SPLIT_KERNEL(direct_lighting);
                REGISTER_SPLIT_KERNEL(shadow_blocked_ao);
                REGISTER_SPLIT_KERNEL(shadow_blocked_dl);
                REGISTER_SPLIT_KERNEL(enqueue_inactive);
                REGISTER_SPLIT_KERNEL(next_iteration_setup);
                REGISTER_SPLIT_KERNEL(indirect_subsurface);
                REGISTER_SPLIT_KERNEL(buffer_update);
#undef REGISTER_SPLIT_KERNEL
#undef KERNEL_FUNCTIONS
        }

        ~CPUDevice()
        {
                task_pool.stop();
                texture_info.free();
        }

        virtual bool show_samples() const
        {
                return (info.cpu_threads == 1);
        }

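        /* Report which BVH layouts this CPU can traverse: 2-wide always, 4-wide
         * with SSE2, 8-wide with AVX2, plus Embree's own layout when compiled
         * in. The caller is expected to pick a supported layout from this
         * mask. */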
        virtual BVHLayoutMask get_bvh_layout_mask() const {
                BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2;
                if(DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) {
                        bvh_layout_mask |= BVH_LAYOUT_BVH4;
                }
                if(DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) {
                        bvh_layout_mask |= BVH_LAYOUT_BVH8;
                }
#ifdef WITH_EMBREE
                bvh_layout_mask |= BVH_LAYOUT_EMBREE;
#endif  /* WITH_EMBREE */
                return bvh_layout_mask;
        }

        void load_texture_info()
        {
                if(need_texture_info) {
                        texture_info.copy_to_device();
                        need_texture_info = false;
                }
        }

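        /* Memory management. On the CPU device there is no separate device
         * address space: for most allocations device_pointer simply aliases
         * host_pointer, copies between "host" and "device" are no-ops, and only
         * MEM_DEVICE_ONLY buffers get their own aligned allocation. */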
        void mem_alloc(device_memory& mem)
        {
                if(mem.type == MEM_TEXTURE) {
                        assert(!"mem_alloc not supported for textures.");
                }
                else {
                        if(mem.name) {
                                VLOG(1) << "Buffer allocate: " << mem.name << ", "
                                        << string_human_readable_number(mem.memory_size()) << " bytes. ("
                                        << string_human_readable_size(mem.memory_size()) << ")";
                        }

                        if(mem.type == MEM_DEVICE_ONLY) {
                                assert(!mem.host_pointer);
                                size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES;
                                void *data = util_aligned_malloc(mem.memory_size(), alignment);
                                mem.device_pointer = (device_ptr)data;
                        }
                        else {
                                mem.device_pointer = (device_ptr)mem.host_pointer;
                        }

                        mem.device_size = mem.memory_size();
                        stats.mem_alloc(mem.device_size);
                }
        }

        void mem_copy_to(device_memory& mem)
        {
                if(mem.type == MEM_TEXTURE) {
                        tex_free(mem);
                        tex_alloc(mem);
                }
                else if(mem.type == MEM_PIXELS) {
                        assert(!"mem_copy_to not supported for pixels.");
                }
                else {
                        if(!mem.device_pointer) {
                                mem_alloc(mem);
                        }

                        /* copy is no-op */
                }
        }

        void mem_copy_from(device_memory& /*mem*/,
                           int /*y*/, int /*w*/, int /*h*/,
                           int /*elem*/)
        {
                /* no-op */
        }

        void mem_zero(device_memory& mem)
        {
                if(!mem.device_pointer) {
                        mem_alloc(mem);
                }

                if(mem.device_pointer) {
                        memset((void*)mem.device_pointer, 0, mem.memory_size());
                }
        }

        void mem_free(device_memory& mem)
        {
                if(mem.type == MEM_TEXTURE) {
                        tex_free(mem);
                }
                else if(mem.device_pointer) {
                        if(mem.type == MEM_DEVICE_ONLY) {
                                util_aligned_free((void*)mem.device_pointer);
                        }
                        mem.device_pointer = 0;
                        stats.mem_free(mem.device_size);
                        mem.device_size = 0;
                }
        }

        virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/)
        {
                return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset));
        }

        void const_copy_to(const char *name, void *host, size_t size)
        {
                kernel_const_copy(&kernel_globals, name, host, size);
        }

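        /* Texture allocation. Data textures are copied straight into the kernel
         * globals; image textures register their host pointer in the flat
         * texture_info table, where the slot index is parsed from the
         * "__tex_image_*" style name (everything after the last '_'). */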
        void tex_alloc(device_memory& mem)
        {
                VLOG(1) << "Texture allocate: " << mem.name << ", "
                        << string_human_readable_number(mem.memory_size()) << " bytes. ("
                        << string_human_readable_size(mem.memory_size()) << ")";

                if(mem.interpolation == INTERPOLATION_NONE) {
                        /* Data texture. */
                        kernel_tex_copy(&kernel_globals,
                                        mem.name,
                                        mem.host_pointer,
                                        mem.data_size);
                }
                else {
                        /* Image texture. */
                        int flat_slot = 0;
                        if(string_startswith(mem.name, "__tex_image")) {
                                int pos = string(mem.name).rfind("_");
                                flat_slot = atoi(mem.name + pos + 1);
                        }
                        else {
                                assert(0);
                        }

                        if(flat_slot >= texture_info.size()) {
                                /* Allocate some slots in advance, to reduce amount
                                 * of re-allocations. */
                                texture_info.resize(flat_slot + 128);
                        }

                        TextureInfo& info = texture_info[flat_slot];
                        info.data = (uint64_t)mem.host_pointer;
                        info.cl_buffer = 0;
                        info.interpolation = mem.interpolation;
                        info.extension = mem.extension;
                        info.width = mem.data_width;
                        info.height = mem.data_height;
                        info.depth = mem.data_depth;

                        need_texture_info = true;
                }

                mem.device_pointer = (device_ptr)mem.host_pointer;
                mem.device_size = mem.memory_size();
                stats.mem_alloc(mem.device_size);
        }

        void tex_free(device_memory& mem)
        {
                if(mem.device_pointer) {
                        mem.device_pointer = 0;
                        stats.mem_free(mem.device_size);
                        mem.device_size = 0;
                        need_texture_info = true;
                }
        }

        void *osl_memory()
        {
#ifdef WITH_OSL
                return &osl_globals;
#else
                return NULL;
#endif
        }

        void thread_run(DeviceTask *task)
        {
                if(task->type == DeviceTask::RENDER) {
                        thread_render(*task);
                }
                else if(task->type == DeviceTask::FILM_CONVERT) {
                        thread_film_convert(*task);
                }
                else if(task->type == DeviceTask::SHADER) {
                        thread_shader(*task);
                }
        }

        class CPUDeviceTask : public DeviceTask {
        public:
                CPUDeviceTask(CPUDevice *device, DeviceTask& task)
                : DeviceTask(task)
                {
                        run = function_bind(&CPUDevice::thread_run, device, this);
                }
        };

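        /* Non-local means denoising. For every offset (dx, dy) inside the
         * (2r+1) x (2r+1) search window this computes a per-pixel difference
         * between the guide image and its shifted copy, blurs it over the patch
         * (f is the patch radius), converts it to a weight, and accumulates the
         * weighted, shifted image into out_ptr together with the weight sum in
         * weightAccum. A final pass divides by the accumulated weights. The
         * three scratch planes live in task->buffer.temporary_mem. */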
        bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
                                       DenoisingTask *task)
        {
                ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_NON_LOCAL_MEANS);

                int4 rect = task->rect;
                int   r   = task->nlm_state.r;
                int   f   = task->nlm_state.f;
                float a   = task->nlm_state.a;
                float k_2 = task->nlm_state.k_2;

                int w = align_up(rect.z-rect.x, 4);
                int h = rect.w-rect.y;
                int stride = task->buffer.stride;
                int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;

                float *temporary_mem = (float*) task->buffer.temporary_mem.device_pointer;
                float *blurDifference = temporary_mem;
                float *difference     = temporary_mem + task->buffer.pass_stride;
                float *weightAccum    = temporary_mem + 2*task->buffer.pass_stride;

                memset(weightAccum, 0, sizeof(float)*w*h);
                memset((float*) out_ptr, 0, sizeof(float)*w*h);

                for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
                        int dy = i / (2*r+1) - r;
                        int dx = i % (2*r+1) - r;

                        int local_rect[4] = {max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)};
                        filter_nlm_calc_difference_kernel()(dx, dy,
                                                            (float*) guide_ptr,
                                                            (float*) variance_ptr,
                                                            NULL,
                                                            difference,
                                                            local_rect,
                                                            w, channel_offset,
                                                            0, a, k_2);

                        filter_nlm_blur_kernel()       (difference, blurDifference, local_rect, w, f);
                        filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f);
                        filter_nlm_blur_kernel()       (difference, blurDifference, local_rect, w, f);

                        filter_nlm_update_output_kernel()(dx, dy,
                                                          blurDifference,
                                                          (float*) image_ptr,
                                                          difference,
                                                          (float*) out_ptr,
                                                          weightAccum,
                                                          local_rect,
                                                          channel_offset,
                                                          stride, f);
                }

                int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y};
                filter_nlm_normalize_kernel()((float*) out_ptr, weightAccum, local_rect, w);

                return true;
        }

        bool denoising_construct_transform(DenoisingTask *task)
        {
                ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_CONSTRUCT_TRANSFORM);

                for(int y = 0; y < task->filter_area.w; y++) {
                        for(int x = 0; x < task->filter_area.z; x++) {
                                filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer,
                                                                    task->tile_info,
                                                                    x + task->filter_area.x,
                                                                    y + task->filter_area.y,
                                                                    y*task->filter_area.z + x,
                                                                    (float*) task->storage.transform.device_pointer,
                                                                    (int*)   task->storage.rank.device_pointer,
                                                                    &task->rect.x,
                                                                    task->buffer.pass_stride,
                                                                    task->buffer.frame_stride,
                                                                    task->buffer.use_time,
                                                                    task->radius,
                                                                    task->pca_threshold);
                        }
                }
                return true;
        }

        bool denoising_accumulate(device_ptr color_ptr,
                                  device_ptr color_variance_ptr,
                                  device_ptr scale_ptr,
                                  int frame,
                                  DenoisingTask *task)
        {
                ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT);

                float *temporary_mem = (float*) task->buffer.temporary_mem.device_pointer;
                float *difference     = temporary_mem;
                float *blurDifference = temporary_mem + task->buffer.pass_stride;

                int r = task->radius;
                int frame_offset = frame * task->buffer.frame_stride;
                for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
                        int dy = i / (2*r+1) - r;
                        int dx = i % (2*r+1) - r;

                        int local_rect[4] = {max(0, -dx), max(0, -dy),
                                             task->reconstruction_state.source_w - max(0, dx),
                                             task->reconstruction_state.source_h - max(0, dy)};
                        filter_nlm_calc_difference_kernel()(dx, dy,
                                                            (float*) color_ptr,
                                                            (float*) color_variance_ptr,
                                                            (float*) scale_ptr,
                                                            difference,
                                                            local_rect,
                                                            task->buffer.stride,
                                                            task->buffer.pass_stride,
                                                            frame_offset,
                                                            1.0f,
                                                            task->nlm_k_2);
                        filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
                        filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.stride, 4);
                        filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
                        filter_nlm_construct_gramian_kernel()(dx, dy,
                                                              task->tile_info->frames[frame],
                                                              blurDifference,
                                                              (float*)  task->buffer.mem.device_pointer,
                                                              (float*)  task->storage.transform.device_pointer,
                                                              (int*)    task->storage.rank.device_pointer,
                                                              (float*)  task->storage.XtWX.device_pointer,
                                                              (float3*) task->storage.XtWY.device_pointer,
                                                              local_rect,
                                                              &task->reconstruction_state.filter_window.x,
                                                              task->buffer.stride,
                                                              4,
                                                              task->buffer.pass_stride,
                                                              frame_offset,
                                                              task->buffer.use_time);
                }

                return true;
        }

        bool denoising_solve(device_ptr output_ptr,
                             DenoisingTask *task)
        {
                for(int y = 0; y < task->filter_area.w; y++) {
                        for(int x = 0; x < task->filter_area.z; x++) {
                                filter_finalize_kernel()(x,
                                                         y,
                                                         y*task->filter_area.z + x,
                                                         (float*)  output_ptr,
                                                         (int*)    task->storage.rank.device_pointer,
                                                         (float*)  task->storage.XtWX.device_pointer,
                                                         (float3*) task->storage.XtWY.device_pointer,
                                                         &task->reconstruction_state.buffer_params.x,
                                                         task->render_buffer.samples);
                        }
                }
                return true;
        }

        bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
                                      device_ptr mean_ptr, device_ptr variance_ptr,
                                      int r, int4 rect, DenoisingTask *task)
        {
                ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_COMBINE_HALVES);

                for(int y = rect.y; y < rect.w; y++) {
                        for(int x = rect.x; x < rect.z; x++) {
                                filter_combine_halves_kernel()(x, y,
                                                               (float*) mean_ptr,
                                                               (float*) variance_ptr,
                                                               (float*) a_ptr,
                                                               (float*) b_ptr,
                                                               &rect.x,
                                                               r);
                        }
                }
                return true;
        }

        bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr,
                                     device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
                                     device_ptr buffer_variance_ptr, DenoisingTask *task)
        {
                ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DIVIDE_SHADOW);

                for(int y = task->rect.y; y < task->rect.w; y++) {
                        for(int x = task->rect.x; x < task->rect.z; x++) {
                                filter_divide_shadow_kernel()(task->render_buffer.samples,
                                                              task->tile_info,
                                                              x, y,
                                                              (float*) a_ptr,
                                                              (float*) b_ptr,
                                                              (float*) sample_variance_ptr,
                                                              (float*) sv_variance_ptr,
                                                              (float*) buffer_variance_ptr,
                                                              &task->rect.x,
                                                              task->render_buffer.pass_stride,
                                                              task->render_buffer.offset);
                        }
                }
                return true;
        }

        bool denoising_get_feature(int mean_offset,
                                   int variance_offset,
                                   device_ptr mean_ptr,
                                   device_ptr variance_ptr,
                                   float scale,
                                   DenoisingTask *task)
        {
                ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_GET_FEATURE);

                for(int y = task->rect.y; y < task->rect.w; y++) {
                        for(int x = task->rect.x; x < task->rect.z; x++) {
                                filter_get_feature_kernel()(task->render_buffer.samples,
                                                            task->tile_info,
                                                            mean_offset,
                                                            variance_offset,
                                                            x, y,
                                                            (float*) mean_ptr,
                                                            (float*) variance_ptr,
                                                            scale,
                                                            &task->rect.x,
                                                            task->render_buffer.pass_stride,
                                                            task->render_buffer.offset);
                        }
                }
                return true;
        }

        bool denoising_write_feature(int out_offset,
                                     device_ptr from_ptr,
                                     device_ptr buffer_ptr,
                                     DenoisingTask *task)
        {
                for(int y = 0; y < task->filter_area.w; y++) {
                        for(int x = 0; x < task->filter_area.z; x++) {
                                filter_write_feature_kernel()(task->render_buffer.samples,
                                                              x + task->filter_area.x,
                                                              y + task->filter_area.y,
                                                              &task->reconstruction_state.buffer_params.x,
                                                              (float*) from_ptr,
                                                              (float*) buffer_ptr,
                                                              out_offset,
                                                              &task->rect.x);
                        }
                }
                return true;
        }

        bool denoising_detect_outliers(device_ptr image_ptr,
                                       device_ptr variance_ptr,
                                       device_ptr depth_ptr,
                                       device_ptr output_ptr,
                                       DenoisingTask *task)
        {
                ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DETECT_OUTLIERS);

                for(int y = task->rect.y; y < task->rect.w; y++) {
                        for(int x = task->rect.x; x < task->rect.z; x++) {
                                filter_detect_outliers_kernel()(x, y,
                                                                (float*) image_ptr,
                                                                (float*) variance_ptr,
                                                                (float*) depth_ptr,
                                                                (float*) output_ptr,
                                                                &task->rect.x,
                                                                task->buffer.pass_stride);
                        }
                }
                return true;
        }

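        /* Main path tracing loop for one tile: one sample over the whole tile at
         * a time, checking for cancellation between samples. Flush-to-zero and
         * denormals-are-zero are enabled on this thread since denormals are slow
         * and irrelevant for rendering. Cryptomatte coverage accumulation is
         * only done when accurate cryptomatte passes are requested. */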
        void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
        {
                const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE;

                scoped_timer timer(&tile.buffers->render_time);

                Coverage coverage(kg, tile);
                if(use_coverage) {
                        coverage.init_path_trace();
                }

                float *render_buffer = (float*)tile.buffer;
                int start_sample = tile.start_sample;
                int end_sample = tile.start_sample + tile.num_samples;

                _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
                _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);

                for(int sample = start_sample; sample < end_sample; sample++) {
                        if(task.get_cancel() || task_pool.canceled()) {
                                if(task.need_finish_queue == false)
                                        break;
                        }

                        for(int y = tile.y; y < tile.y + tile.h; y++) {
                                for(int x = tile.x; x < tile.x + tile.w; x++) {
                                        if(use_coverage) {
                                                coverage.init_pixel(x, y);
                                        }
                                        path_trace_kernel()(kg, render_buffer,
                                                            sample, x, y, tile.offset, tile.stride);
                                }
                        }

                        tile.sample = sample + 1;

                        task.update_progress(&tile, tile.w*tile.h);
                }
                if(use_coverage) {
                        coverage.finalize();
                }
        }

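        /* Denoise one tile. The generic DenoisingTask drives the algorithm; here
         * it is only wired up to the CPU filter kernels via function_bind, so
         * the same control flow can be shared with the GPU devices. */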
        void denoise(DenoisingTask& denoising, RenderTile &tile)
        {
                ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING);

                tile.sample = tile.start_sample + tile.num_samples;

                denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising);
                denoising.functions.accumulate = function_bind(&CPUDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
                denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising);
                denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
                denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
                denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
                denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
                denoising.functions.write_feature = function_bind(&CPUDevice::denoising_write_feature, this, _1, _2, _3, &denoising);
                denoising.functions.detect_outliers = function_bind(&CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);

                denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
                denoising.render_buffer.samples = tile.sample;
                denoising.buffer.gpu_temporary_mem = false;

                denoising.run_denoising(&tile);
        }

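        /* Per-thread render entry point. Each worker gets its own KernelGlobals
         * copy, constructed with placement new inside a device-only buffer so
         * the split kernel can treat it as device memory as well, then acquires
         * tiles from the task until none are left or the task is cancelled. */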
        void thread_render(DeviceTask& task)
        {
                if(task_pool.canceled()) {
                        if(task.need_finish_queue == false)
                                return;
                }

                /* allocate buffer for kernel globals */
                device_only_memory<KernelGlobals> kgbuffer(this, "kernel_globals");
                kgbuffer.alloc_to_device(1);

                KernelGlobals *kg = new ((void*) kgbuffer.device_pointer) KernelGlobals(thread_kernel_globals_init());

                profiler.add_state(&kg->profiler);

                CPUSplitKernel *split_kernel = NULL;
                if(use_split_kernel) {
                        split_kernel = new CPUSplitKernel(this);
                        if(!split_kernel->load_kernels(requested_features)) {
                                thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
                                kgbuffer.free();
                                delete split_kernel;
                                return;
                        }
                }

                RenderTile tile;
                DenoisingTask denoising(this, task);
                denoising.profiler = &kg->profiler;

                while(task.acquire_tile(this, tile)) {
                        if(tile.task == RenderTile::PATH_TRACE) {
                                if(use_split_kernel) {
                                        device_only_memory<uchar> void_buffer(this, "void_buffer");
                                        split_kernel->path_trace(&task, tile, kgbuffer, void_buffer);
                                }
                                else {
                                        path_trace(task, tile, kg);
                                }
                        }
                        else if(tile.task == RenderTile::DENOISE) {
                                denoise(denoising, tile);
                                task.update_progress(&tile, tile.w*tile.h);
                        }

                        task.release_tile(tile);

                        if(task_pool.canceled()) {
                                if(task.need_finish_queue == false)
                                        break;
                        }
                }

                profiler.remove_state(&kg->profiler);

                thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
                kg->~KernelGlobals();
                kgbuffer.free();
                delete split_kernel;
        }

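        /* Convert the accumulated float render buffer to display pixels. The
         * buffer stores the running sum over samples, so scaling by
         * 1/(task.sample + 1) averages it (task.sample is the zero-based index
         * of the last finished sample). */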
        void thread_film_convert(DeviceTask& task)
        {
                float sample_scale = 1.0f/(task.sample + 1);

                if(task.rgba_half) {
                        for(int y = task.y; y < task.y + task.h; y++)
                                for(int x = task.x; x < task.x + task.w; x++)
                                        convert_to_half_float_kernel()(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
                                                                       sample_scale, x, y, task.offset, task.stride);
                }
                else {
                        for(int y = task.y; y < task.y + task.h; y++)
                                for(int x = task.x; x < task.x + task.w; x++)
                                        convert_to_byte_kernel()(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
                                                                 sample_scale, x, y, task.offset, task.stride);
                }
        }

        void thread_shader(DeviceTask& task)
        {
                KernelGlobals kg = kernel_globals;

#ifdef WITH_OSL
                OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif
                for(int sample = 0; sample < task.num_samples; sample++) {
                        for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
                                shader_kernel()(&kg,
                                                (uint4*)task.shader_input,
                                                (float4*)task.shader_output,
                                                task.shader_eval_type,
                                                task.shader_filter,
                                                x,
                                                task.offset,
                                                sample);

                        if(task.get_cancel() || task_pool.canceled())
                                break;

                        task.update_progress(NULL);
                }

#ifdef WITH_OSL
                OSLShader::thread_free(&kg);
#endif
        }

        int get_split_task_count(DeviceTask& task)
        {
                if(task.type == DeviceTask::SHADER)
                        return task.get_subtask_count(info.cpu_threads, 256);
                else
                        return task.get_subtask_count(info.cpu_threads);
        }

        void task_add(DeviceTask& task)
        {
                /* Load texture info. */
                load_texture_info();

                /* Split task into smaller ones. */
                list<DeviceTask> tasks;

                if(task.type == DeviceTask::SHADER)
                        task.split(tasks, info.cpu_threads, 256);
                else
                        task.split(tasks, info.cpu_threads);

                foreach(DeviceTask& task, tasks)
                        task_pool.push(new CPUDeviceTask(this, task));
        }

        void task_wait()
        {
                task_pool.wait_work();
        }

        void task_cancel()
        {
                task_pool.cancel();
        }

protected:
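        /* Each render thread works on its own copy of the shared kernel_globals.
         * The init/free pair below resets the per-thread caches (transparent
         * shadow intersections, decoupled volume step storage, cryptomatte
         * coverage pointers) and sets up/tears down per-thread OSL state when
         * OSL is enabled. */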
        inline KernelGlobals thread_kernel_globals_init()
        {
                KernelGlobals kg = kernel_globals;
                kg.transparent_shadow_intersections = NULL;
                const int decoupled_count = sizeof(kg.decoupled_volume_steps) /
                                            sizeof(*kg.decoupled_volume_steps);
                for(int i = 0; i < decoupled_count; ++i) {
                        kg.decoupled_volume_steps[i] = NULL;
                }
                kg.decoupled_volume_steps_index = 0;
                kg.coverage_asset = kg.coverage_object = kg.coverage_material = NULL;
#ifdef WITH_OSL
                OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif
                return kg;
        }

        inline void thread_kernel_globals_free(KernelGlobals *kg)
        {
                if(kg == NULL) {
                        return;
                }

                if(kg->transparent_shadow_intersections != NULL) {
                        free(kg->transparent_shadow_intersections);
                }
                const int decoupled_count = sizeof(kg->decoupled_volume_steps) /
                                            sizeof(*kg->decoupled_volume_steps);
                for(int i = 0; i < decoupled_count; ++i) {
                        if(kg->decoupled_volume_steps[i] != NULL) {
                                free(kg->decoupled_volume_steps[i]);
                        }
                }
#ifdef WITH_OSL
                OSLShader::thread_free(kg);
#endif
        }

        virtual bool load_kernels(const DeviceRequestedFeatures& requested_features_) {
                requested_features = requested_features_;

                return true;
        }
};

/* split kernel */

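/* The CPU split kernel exists mainly for testing and debugging the split
 * kernel code path that the GPU devices use (it is enabled through the CPU
 * debug flags). "Enqueueing" a kernel here simply runs it serially for every
 * (x, y) work item in the global size, with a 1x1 local size. */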
class CPUSplitKernelFunction : public SplitKernelFunction {
public:
        CPUDevice* device;
        void (*func)(KernelGlobals *kg, KernelData *data);

        CPUSplitKernelFunction(CPUDevice* device) : device(device), func(NULL) {}
        ~CPUSplitKernelFunction() {}

        virtual bool enqueue(const KernelDimensions& dim, device_memory& kernel_globals, device_memory& data)
        {
                if(!func) {
                        return false;
                }

                KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
                kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);

                for(int y = 0; y < dim.global_size[1]; y++) {
                        for(int x = 0; x < dim.global_size[0]; x++) {
                                kg->global_id = make_int2(x, y);

                                func(kg, (KernelData*)data.device_pointer);
                        }
                }

                return true;
        }
};

CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device)
{
}

bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
                                                    RenderTile& rtile,
                                                    int num_global_elements,
                                                    device_memory& kernel_globals,
                                                    device_memory& data,
                                                    device_memory& split_data,
                                                    device_memory& ray_state,
                                                    device_memory& queue_index,
                                                    device_memory& use_queues_flags,
                                                    device_memory& work_pool_wgs)
{
        KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
        kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);

        for(int y = 0; y < dim.global_size[1]; y++) {
                for(int x = 0; x < dim.global_size[0]; x++) {
                        kg->global_id = make_int2(x, y);

                        device->data_init_kernel()((KernelGlobals*)kernel_globals.device_pointer,
                                                   (KernelData*)data.device_pointer,
                                                   (void*)split_data.device_pointer,
                                                   num_global_elements,
                                                   (char*)ray_state.device_pointer,
                                                   rtile.start_sample,
                                                   rtile.start_sample + rtile.num_samples,
                                                   rtile.x,
                                                   rtile.y,
                                                   rtile.w,
                                                   rtile.h,
                                                   rtile.offset,
                                                   rtile.stride,
                                                   (int*)queue_index.device_pointer,
                                                   dim.global_size[0] * dim.global_size[1],
                                                   (char*)use_queues_flags.device_pointer,
                                                   (uint*)work_pool_wgs.device_pointer,
                                                   rtile.num_samples,
                                                   (float*)rtile.buffer);
                }
        }

        return true;
}

SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(const string& kernel_name,
                                                               const DeviceRequestedFeatures&)
{
        CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);

        kernel->func = device->split_kernels[kernel_name]();
        if(!kernel->func) {
                delete kernel;
                return NULL;
        }

        return kernel;
}

int2 CPUSplitKernel::split_kernel_local_size()
{
        return make_int2(1, 1);
}

int2 CPUSplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask * /*task*/) {
        return make_int2(1, 1);
}

uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device_memory& /*data*/, size_t num_threads) {
        KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;

        return split_data_buffer_size(kg, num_threads);
}

Device *device_cpu_create(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background)
{
        return new CPUDevice(info, stats, profiler, background);
}

void device_cpu_info(vector<DeviceInfo>& devices)
{
        DeviceInfo info;

        info.type = DEVICE_CPU;
        info.description = system_cpu_brand_string();
        info.id = "CPU";
        info.num = 0;
        info.advanced_shading = true;
        info.has_volume_decoupled = true;
        info.has_osl = true;
        info.has_half_images = true;
        info.has_profiling = true;

        devices.insert(devices.begin(), info);
}

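/* Build a space-separated list of the instruction sets supported by the
 * current CPU, for reporting (e.g. system info and debug output). On a CPU
 * that supports everything the result is "SSE2 SSE3 SSE41 AVX AVX2". */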
string device_cpu_capabilities()
{
        string capabilities = "";
        capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
        capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
        capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
        capabilities += system_cpu_support_avx() ? "AVX " : "";
        capabilities += system_cpu_support_avx2() ? "AVX2" : "";
        /* Trim the trailing space, guarding against an empty string when no
         * extensions are supported. */
        if(!capabilities.empty() && capabilities[capabilities.size() - 1] == ' ')
                capabilities.resize(capabilities.size() - 1);
        return capabilities;
}

CCL_NAMESPACE_END