Cycles: Replace __MAX_CLOSURE__ build option with runtime integrator variable
[blender.git] / intern / cycles / device / device_cpu.cpp
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include <stdlib.h>
18 #include <string.h>
19
20 /* So ImathMath is included before our kernel_cpu_compat. */
21 #ifdef WITH_OSL
22 /* So no context pollution happens from indirectly included windows.h */
23 #  include "util/util_windows.h"
24 #  include <OSL/oslexec.h>
25 #endif
26
27 #include "device/device.h"
28 #include "device/device_denoising.h"
29 #include "device/device_intern.h"
30 #include "device/device_split_kernel.h"
31
32 #include "kernel/kernel.h"
33 #include "kernel/kernel_compat_cpu.h"
34 #include "kernel/kernel_types.h"
35 #include "kernel/split/kernel_split_data.h"
36 #include "kernel/kernel_globals.h"
37
38 #include "kernel/filter/filter.h"
39
40 #include "kernel/osl/osl_shader.h"
41 #include "kernel/osl/osl_globals.h"
42
43 #include "render/buffers.h"
44
45 #include "util/util_debug.h"
46 #include "util/util_foreach.h"
47 #include "util/util_function.h"
48 #include "util/util_logging.h"
49 #include "util/util_map.h"
50 #include "util/util_opengl.h"
51 #include "util/util_optimization.h"
52 #include "util/util_progress.h"
53 #include "util/util_system.h"
54 #include "util/util_thread.h"
55
56 CCL_NAMESPACE_BEGIN
57
58 class CPUDevice;
59
60 /* Has to be outside of the class to be shared across template instantiations. */
61 static const char *logged_architecture = "";
62
63 template<typename F>
64 class KernelFunctions {
65 public:
66         KernelFunctions()
67         {
68                 kernel = (F)NULL;
69         }
70
71         KernelFunctions(F kernel_default,
72                         F kernel_sse2,
73                         F kernel_sse3,
74                         F kernel_sse41,
75                         F kernel_avx,
76                         F kernel_avx2)
77         {
78                 const char *architecture_name = "default";
79                 kernel = kernel_default;
80
81                 /* Silence potential warnings about unused variables
82                  * when compiling without some architectures. */
83                 (void)kernel_sse2;
84                 (void)kernel_sse3;
85                 (void)kernel_sse41;
86                 (void)kernel_avx;
87                 (void)kernel_avx2;
88 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
89                 if(system_cpu_support_avx2()) {
90                         architecture_name = "AVX2";
91                         kernel = kernel_avx2;
92                 }
93                 else
94 #endif
95 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
96                 if(system_cpu_support_avx()) {
97                         architecture_name = "AVX";
98                         kernel = kernel_avx;
99                 }
100                 else
101 #endif
102 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
103                 if(system_cpu_support_sse41()) {
104                         architecture_name = "SSE4.1";
105                         kernel = kernel_sse41;
106                 }
107                 else
108 #endif
109 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
110                 if(system_cpu_support_sse3()) {
111                         architecture_name = "SSE3";
112                         kernel = kernel_sse3;
113                 }
114                 else
115 #endif
116 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
117                 if(system_cpu_support_sse2()) {
118                         architecture_name = "SSE2";
119                         kernel = kernel_sse2;
120                 }
121 #endif
122
123                 if(strcmp(architecture_name, logged_architecture) != 0) {
124                         VLOG(1) << "Will be using " << architecture_name << " kernels.";
125                         logged_architecture = architecture_name;
126                 }
127         }
128
129         inline F operator()() const {
130                 assert(kernel);
131                 return kernel;
132         }
133 protected:
134         F kernel;
135 };
136
137 class CPUSplitKernel : public DeviceSplitKernel {
138         CPUDevice *device;
139 public:
140         explicit CPUSplitKernel(CPUDevice *device);
141
142         virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
143                                                     RenderTile& rtile,
144                                                     int num_global_elements,
145                                                     device_memory& kernel_globals,
146                                                     device_memory& kernel_data_,
147                                                     device_memory& split_data,
148                                                     device_memory& ray_state,
149                                                     device_memory& queue_index,
150                                                     device_memory& use_queues_flag,
151                                                     device_memory& work_pool_wgs);
152
153         virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name,
154                                                                const DeviceRequestedFeatures&);
155         virtual int2 split_kernel_local_size();
156         virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
157         virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
158 };
159
160 class CPUDevice : public Device
161 {
162 public:
163         TaskPool task_pool;
164         KernelGlobals kernel_globals;
165
166         device_vector<TextureInfo> texture_info;
167         bool need_texture_info;
168
169 #ifdef WITH_OSL
170         OSLGlobals osl_globals;
171 #endif
172
173         bool use_split_kernel;
174
175         DeviceRequestedFeatures requested_features;
176
177         KernelFunctions<void(*)(KernelGlobals *, float *, int, int, int, int, int)>             path_trace_kernel;
178         KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_half_float_kernel;
179         KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel;
180         KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)>   shader_kernel;
181
182         KernelFunctions<void(*)(int, TilesInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_divide_shadow_kernel;
183         KernelFunctions<void(*)(int, TilesInfo*, int, int, int, int, float*, float*, int*, int, int)>               filter_get_feature_kernel;
184         KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                               filter_detect_outliers_kernel;
185         KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                               filter_combine_halves_kernel;
186
187         KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
188         KernelFunctions<void(*)(float*, float*, int*, int, int)>                                 filter_nlm_blur_kernel;
189         KernelFunctions<void(*)(float*, float*, int*, int, int)>                                 filter_nlm_calc_weight_kernel;
190         KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)>       filter_nlm_update_output_kernel;
191         KernelFunctions<void(*)(float*, float*, int*, int)>                                      filter_nlm_normalize_kernel;
192
193         KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)>                              filter_construct_transform_kernel;
194         KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int)> filter_nlm_construct_gramian_kernel;
195         KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)>                       filter_finalize_kernel;
196
197         KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*,
198                                int, int, int, int, int, int, int, int, ccl_global int*, int,
199                                ccl_global char*, ccl_global unsigned int*, unsigned int, ccl_global float*)>        data_init_kernel;
200         unordered_map<string, KernelFunctions<void(*)(KernelGlobals*, KernelData*)> > split_kernels;
201
202 #define KERNEL_FUNCTIONS(name) \
203               KERNEL_NAME_EVAL(cpu, name), \
204               KERNEL_NAME_EVAL(cpu_sse2, name), \
205               KERNEL_NAME_EVAL(cpu_sse3, name), \
206               KERNEL_NAME_EVAL(cpu_sse41, name), \
207               KERNEL_NAME_EVAL(cpu_avx, name), \
208               KERNEL_NAME_EVAL(cpu_avx2, name)
209
210         CPUDevice(DeviceInfo& info_, Stats &stats_, bool background_)
211         : Device(info_, stats_, background_),
212           texture_info(this, "__texture_info", MEM_TEXTURE),
213 #define REGISTER_KERNEL(name) name ## _kernel(KERNEL_FUNCTIONS(name))
214           REGISTER_KERNEL(path_trace),
215           REGISTER_KERNEL(convert_to_half_float),
216           REGISTER_KERNEL(convert_to_byte),
217           REGISTER_KERNEL(shader),
218           REGISTER_KERNEL(filter_divide_shadow),
219           REGISTER_KERNEL(filter_get_feature),
220           REGISTER_KERNEL(filter_detect_outliers),
221           REGISTER_KERNEL(filter_combine_halves),
222           REGISTER_KERNEL(filter_nlm_calc_difference),
223           REGISTER_KERNEL(filter_nlm_blur),
224           REGISTER_KERNEL(filter_nlm_calc_weight),
225           REGISTER_KERNEL(filter_nlm_update_output),
226           REGISTER_KERNEL(filter_nlm_normalize),
227           REGISTER_KERNEL(filter_construct_transform),
228           REGISTER_KERNEL(filter_nlm_construct_gramian),
229           REGISTER_KERNEL(filter_finalize),
230           REGISTER_KERNEL(data_init)
231 #undef REGISTER_KERNEL
232         {
233                 if(info.cpu_threads == 0) {
234                         info.cpu_threads = TaskScheduler::num_threads();
235                 }
236
237 #ifdef WITH_OSL
238                 kernel_globals.osl = &osl_globals;
239 #endif
240                 use_split_kernel = DebugFlags().cpu.split_kernel;
241                 if(use_split_kernel) {
242                         VLOG(1) << "Will be using split kernel.";
243                 }
244                 need_texture_info = false;
245
246 #define REGISTER_SPLIT_KERNEL(name) split_kernels[#name] = KernelFunctions<void(*)(KernelGlobals*, KernelData*)>(KERNEL_FUNCTIONS(name))
247                 REGISTER_SPLIT_KERNEL(path_init);
248                 REGISTER_SPLIT_KERNEL(scene_intersect);
249                 REGISTER_SPLIT_KERNEL(lamp_emission);
250                 REGISTER_SPLIT_KERNEL(do_volume);
251                 REGISTER_SPLIT_KERNEL(queue_enqueue);
252                 REGISTER_SPLIT_KERNEL(indirect_background);
253                 REGISTER_SPLIT_KERNEL(shader_setup);
254                 REGISTER_SPLIT_KERNEL(shader_sort);
255                 REGISTER_SPLIT_KERNEL(shader_eval);
256                 REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao);
257                 REGISTER_SPLIT_KERNEL(subsurface_scatter);
258                 REGISTER_SPLIT_KERNEL(direct_lighting);
259                 REGISTER_SPLIT_KERNEL(shadow_blocked_ao);
260                 REGISTER_SPLIT_KERNEL(shadow_blocked_dl);
261                 REGISTER_SPLIT_KERNEL(enqueue_inactive);
262                 REGISTER_SPLIT_KERNEL(next_iteration_setup);
263                 REGISTER_SPLIT_KERNEL(indirect_subsurface);
264                 REGISTER_SPLIT_KERNEL(buffer_update);
265 #undef REGISTER_SPLIT_KERNEL
266 #undef KERNEL_FUNCTIONS
267         }
268
269         ~CPUDevice()
270         {
271                 task_pool.stop();
272                 texture_info.free();
273         }
274
275         virtual bool show_samples() const
276         {
277                 return (info.cpu_threads == 1);
278         }
279
280         void load_texture_info()
281         {
282                 if(need_texture_info) {
283                         texture_info.copy_to_device();
284                         need_texture_info = false;
285                 }
286         }
287
288         void mem_alloc(device_memory& mem)
289         {
290                 if(mem.type == MEM_TEXTURE) {
291                         assert(!"mem_alloc not supported for textures.");
292                 }
293                 else {
294                         if(mem.name) {
295                                 VLOG(1) << "Buffer allocate: " << mem.name << ", "
296                                                 << string_human_readable_number(mem.memory_size()) << " bytes. ("
297                                                 << string_human_readable_size(mem.memory_size()) << ")";
298                         }
299
300                         if(mem.type == MEM_DEVICE_ONLY) {
301                                 assert(!mem.host_pointer);
302                                 size_t alignment = mem_address_alignment();
303                                 void *data = util_aligned_malloc(mem.memory_size(), alignment);
304                                 mem.device_pointer = (device_ptr)data;
305                         }
306                         else {
307                                 mem.device_pointer = (device_ptr)mem.host_pointer;
308                         }
309
310                         mem.device_size = mem.memory_size();
311                         stats.mem_alloc(mem.device_size);
312                 }
313         }
314
315         void mem_copy_to(device_memory& mem)
316         {
317                 if(mem.type == MEM_TEXTURE) {
318                         tex_free(mem);
319                         tex_alloc(mem);
320                 }
321                 else if(mem.type == MEM_PIXELS) {
322                         assert(!"mem_copy_to not supported for pixels.");
323                 }
324                 else {
325                         if(!mem.device_pointer) {
326                                 mem_alloc(mem);
327                         }
328
329                         /* copy is no-op */
330                 }
331         }
332
333         void mem_copy_from(device_memory& /*mem*/,
334                            int /*y*/, int /*w*/, int /*h*/,
335                            int /*elem*/)
336         {
337                 /* no-op */
338         }
339
340         void mem_zero(device_memory& mem)
341         {
342                 if(!mem.device_pointer) {
343                         mem_alloc(mem);
344                 }
345
346                 if(mem.device_pointer) {
347                         memset((void*)mem.device_pointer, 0, mem.memory_size());
348                 }
349         }
350
351         void mem_free(device_memory& mem)
352         {
353                 if(mem.type == MEM_TEXTURE) {
354                         tex_free(mem);
355                 }
356                 else if(mem.device_pointer) {
357                         if(mem.type == MEM_DEVICE_ONLY) {
358                                 util_aligned_free((void*)mem.device_pointer);
359                         }
360                         mem.device_pointer = 0;
361                         stats.mem_free(mem.device_size);
362                         mem.device_size = 0;
363                 }
364         }
365
366         virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/)
367         {
368                 return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset));
369         }
370
371         void const_copy_to(const char *name, void *host, size_t size)
372         {
373                 kernel_const_copy(&kernel_globals, name, host, size);
374         }
375
376         void tex_alloc(device_memory& mem)
377         {
378                 VLOG(1) << "Texture allocate: " << mem.name << ", "
379                         << string_human_readable_number(mem.memory_size()) << " bytes. ("
380                         << string_human_readable_size(mem.memory_size()) << ")";
381
382                 if(mem.interpolation == INTERPOLATION_NONE) {
383                         /* Data texture. */
384                         kernel_tex_copy(&kernel_globals,
385                                                         mem.name,
386                                                         mem.host_pointer,
387                                                         mem.data_size);
388                 }
389                 else {
390                         /* Image Texture. */
391                         int flat_slot = 0;
392                         if(string_startswith(mem.name, "__tex_image")) {
393                                 int pos =  string(mem.name).rfind("_");
394                                 flat_slot = atoi(mem.name + pos + 1);
395                         }
396                         else {
397                                 assert(0);
398                         }
399
400                         if(flat_slot >= texture_info.size()) {
401                                 /* Allocate some slots in advance, to reduce amount
402                                  * of re-allocations. */
403                                 texture_info.resize(flat_slot + 128);
404                         }
405
406                         TextureInfo& info = texture_info[flat_slot];
407                         info.data = (uint64_t)mem.host_pointer;
408                         info.cl_buffer = 0;
409                         info.interpolation = mem.interpolation;
410                         info.extension = mem.extension;
411                         info.width = mem.data_width;
412                         info.height = mem.data_height;
413                         info.depth = mem.data_depth;
414
415                         need_texture_info = true;
416                 }
417
418                 mem.device_pointer = (device_ptr)mem.host_pointer;
419                 mem.device_size = mem.memory_size();
420                 stats.mem_alloc(mem.device_size);
421         }
422
423         void tex_free(device_memory& mem)
424         {
425                 if(mem.device_pointer) {
426                         mem.device_pointer = 0;
427                         stats.mem_free(mem.device_size);
428                         mem.device_size = 0;
429                         need_texture_info = true;
430                 }
431         }
432
433         void *osl_memory()
434         {
435 #ifdef WITH_OSL
436                 return &osl_globals;
437 #else
438                 return NULL;
439 #endif
440         }
441
442         void thread_run(DeviceTask *task)
443         {
444                 if(task->type == DeviceTask::RENDER) {
445                         thread_render(*task);
446                 }
447                 else if(task->type == DeviceTask::FILM_CONVERT)
448                         thread_film_convert(*task);
449                 else if(task->type == DeviceTask::SHADER)
450                         thread_shader(*task);
451         }
452
453         class CPUDeviceTask : public DeviceTask {
454         public:
455                 CPUDeviceTask(CPUDevice *device, DeviceTask& task)
456                 : DeviceTask(task)
457                 {
458                         run = function_bind(&CPUDevice::thread_run, device, this);
459                 }
460         };
461
462         bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task)
463         {
464                 TilesInfo *tiles = (TilesInfo*) task->tiles_mem.host_pointer;
465                 for(int i = 0; i < 9; i++) {
466                         tiles->buffers[i] = buffers[i];
467                 }
468
469                 task->tiles_mem.copy_to_device();
470
471                 return true;
472         }
473
474         bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
475                                        DenoisingTask *task)
476         {
477                 int4 rect = task->rect;
478                 int   r   = task->nlm_state.r;
479                 int   f   = task->nlm_state.f;
480                 float a   = task->nlm_state.a;
481                 float k_2 = task->nlm_state.k_2;
482
483                 int w = align_up(rect.z-rect.x, 4);
484                 int h = rect.w-rect.y;
485
486                 float *blurDifference = (float*) task->nlm_state.temporary_1_ptr;
487                 float *difference     = (float*) task->nlm_state.temporary_2_ptr;
488                 float *weightAccum    = (float*) task->nlm_state.temporary_3_ptr;
489
490                 memset(weightAccum, 0, sizeof(float)*w*h);
491                 memset((float*) out_ptr, 0, sizeof(float)*w*h);
492
493                 for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
494                         int dy = i / (2*r+1) - r;
495                         int dx = i % (2*r+1) - r;
496
497                         int local_rect[4] = {max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)};
498                         filter_nlm_calc_difference_kernel()(dx, dy,
499                                                             (float*) guide_ptr,
500                                                             (float*) variance_ptr,
501                                                             difference,
502                                                             local_rect,
503                                                             w, 0,
504                                                             a, k_2);
505
506                         filter_nlm_blur_kernel()       (difference, blurDifference, local_rect, w, f);
507                         filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f);
508                         filter_nlm_blur_kernel()       (difference, blurDifference, local_rect, w, f);
509
510                         filter_nlm_update_output_kernel()(dx, dy,
511                                                           blurDifference,
512                                                           (float*) image_ptr,
513                                                           (float*) out_ptr,
514                                                           weightAccum,
515                                                           local_rect,
516                                                           w, f);
517                 }
518
519                 int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y};
520                 filter_nlm_normalize_kernel()((float*) out_ptr, weightAccum, local_rect, w);
521
522                 return true;
523         }
524
525         bool denoising_construct_transform(DenoisingTask *task)
526         {
527                 for(int y = 0; y < task->filter_area.w; y++) {
528                         for(int x = 0; x < task->filter_area.z; x++) {
529                                 filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer,
530                                                                     x + task->filter_area.x,
531                                                                     y + task->filter_area.y,
532                                                                     y*task->filter_area.z + x,
533                                                                     (float*) task->storage.transform.device_pointer,
534                                                                     (int*)   task->storage.rank.device_pointer,
535                                                                     &task->rect.x,
536                                                                     task->buffer.pass_stride,
537                                                                     task->radius,
538                                                                     task->pca_threshold);
539                         }
540                 }
541                 return true;
542         }
543
544         bool denoising_reconstruct(device_ptr color_ptr,
545                                    device_ptr color_variance_ptr,
546                                    device_ptr output_ptr,
547                                    DenoisingTask *task)
548         {
549                 mem_zero(task->storage.XtWX);
550                 mem_zero(task->storage.XtWY);
551
552                 float *difference     = (float*) task->reconstruction_state.temporary_1_ptr;
553                 float *blurDifference = (float*) task->reconstruction_state.temporary_2_ptr;
554
555                 int r = task->radius;
556                 for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
557                         int dy = i / (2*r+1) - r;
558                         int dx = i % (2*r+1) - r;
559
560                         int local_rect[4] = {max(0, -dx), max(0, -dy),
561                                              task->reconstruction_state.source_w - max(0, dx),
562                                              task->reconstruction_state.source_h - max(0, dy)};
563                         filter_nlm_calc_difference_kernel()(dx, dy,
564                                                             (float*) color_ptr,
565                                                             (float*) color_variance_ptr,
566                                                             difference,
567                                                             local_rect,
568                                                             task->buffer.w,
569                                                             task->buffer.pass_stride,
570                                                             1.0f,
571                                                             task->nlm_k_2);
572                         filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4);
573                         filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.w, 4);
574                         filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4);
575                         filter_nlm_construct_gramian_kernel()(dx, dy,
576                                                               blurDifference,
577                                                               (float*)  task->buffer.mem.device_pointer,
578                                                               (float*)  task->storage.transform.device_pointer,
579                                                               (int*)    task->storage.rank.device_pointer,
580                                                               (float*)  task->storage.XtWX.device_pointer,
581                                                               (float3*) task->storage.XtWY.device_pointer,
582                                                               local_rect,
583                                                               &task->reconstruction_state.filter_rect.x,
584                                                               task->buffer.w,
585                                                               task->buffer.h,
586                                                               4,
587                                                               task->buffer.pass_stride);
588                 }
589                 for(int y = 0; y < task->filter_area.w; y++) {
590                         for(int x = 0; x < task->filter_area.z; x++) {
591                                 filter_finalize_kernel()(x,
592                                                          y,
593                                                          y*task->filter_area.z + x,
594                                                          task->buffer.w,
595                                                          task->buffer.h,
596                                                          (float*)  output_ptr,
597                                                          (int*)    task->storage.rank.device_pointer,
598                                                          (float*)  task->storage.XtWX.device_pointer,
599                                                          (float3*) task->storage.XtWY.device_pointer,
600                                                          &task->reconstruction_state.buffer_params.x,
601                                                          task->render_buffer.samples);
602                         }
603                 }
604                 return true;
605         }
606
607         bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
608                                       device_ptr mean_ptr, device_ptr variance_ptr,
609                                       int r, int4 rect, DenoisingTask * /*task*/)
610         {
611                 for(int y = rect.y; y < rect.w; y++) {
612                         for(int x = rect.x; x < rect.z; x++) {
613                                 filter_combine_halves_kernel()(x, y,
614                                                                (float*) mean_ptr,
615                                                                (float*) variance_ptr,
616                                                                (float*) a_ptr,
617                                                                (float*) b_ptr,
618                                                                &rect.x,
619                                                                r);
620                         }
621                 }
622                 return true;
623         }
624
625         bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr,
626                                      device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
627                                      device_ptr buffer_variance_ptr, DenoisingTask *task)
628         {
629                 for(int y = task->rect.y; y < task->rect.w; y++) {
630                         for(int x = task->rect.x; x < task->rect.z; x++) {
631                                 filter_divide_shadow_kernel()(task->render_buffer.samples,
632                                                               task->tiles,
633                                                               x, y,
634                                                               (float*) a_ptr,
635                                                               (float*) b_ptr,
636                                                               (float*) sample_variance_ptr,
637                                                               (float*) sv_variance_ptr,
638                                                               (float*) buffer_variance_ptr,
639                                                               &task->rect.x,
640                                                               task->render_buffer.pass_stride,
641                                                               task->render_buffer.denoising_data_offset);
642                         }
643                 }
644                 return true;
645         }
646
647         bool denoising_get_feature(int mean_offset,
648                                    int variance_offset,
649                                    device_ptr mean_ptr,
650                                    device_ptr variance_ptr,
651                                    DenoisingTask *task)
652         {
653                 for(int y = task->rect.y; y < task->rect.w; y++) {
654                         for(int x = task->rect.x; x < task->rect.z; x++) {
655                                 filter_get_feature_kernel()(task->render_buffer.samples,
656                                                             task->tiles,
657                                                             mean_offset,
658                                                             variance_offset,
659                                                             x, y,
660                                                             (float*) mean_ptr,
661                                                             (float*) variance_ptr,
662                                                             &task->rect.x,
663                                                             task->render_buffer.pass_stride,
664                                                             task->render_buffer.denoising_data_offset);
665                         }
666                 }
667                 return true;
668         }
669
670         bool denoising_detect_outliers(device_ptr image_ptr,
671                                        device_ptr variance_ptr,
672                                        device_ptr depth_ptr,
673                                        device_ptr output_ptr,
674                                        DenoisingTask *task)
675         {
676                 for(int y = task->rect.y; y < task->rect.w; y++) {
677                         for(int x = task->rect.x; x < task->rect.z; x++) {
678                                 filter_detect_outliers_kernel()(x, y,
679                                                                 (float*) image_ptr,
680                                                                 (float*) variance_ptr,
681                                                                 (float*) depth_ptr,
682                                                                 (float*) output_ptr,
683                                                                 &task->rect.x,
684                                                                 task->buffer.pass_stride);
685                         }
686                 }
687                 return true;
688         }
689
690         void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
691         {
692                 float *render_buffer = (float*)tile.buffer;
693                 int start_sample = tile.start_sample;
694                 int end_sample = tile.start_sample + tile.num_samples;
695
696                 for(int sample = start_sample; sample < end_sample; sample++) {
697                         if(task.get_cancel() || task_pool.canceled()) {
698                                 if(task.need_finish_queue == false)
699                                         break;
700                         }
701
702                         for(int y = tile.y; y < tile.y + tile.h; y++) {
703                                 for(int x = tile.x; x < tile.x + tile.w; x++) {
704                                         path_trace_kernel()(kg, render_buffer,
705                                                             sample, x, y, tile.offset, tile.stride);
706                                 }
707                         }
708
709                         tile.sample = sample + 1;
710
711                         task.update_progress(&tile, tile.w*tile.h);
712                 }
713         }
714
715         void denoise(DeviceTask &task, RenderTile &tile)
716         {
717                 tile.sample = tile.start_sample + tile.num_samples;
718
719                 DenoisingTask denoising(this);
720
721                 denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising);
722                 denoising.functions.reconstruct = function_bind(&CPUDevice::denoising_reconstruct, this, _1, _2, _3, &denoising);
723                 denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
724                 denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
725                 denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
726                 denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
727                 denoising.functions.detect_outliers = function_bind(&CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
728                 denoising.functions.set_tiles = function_bind(&CPUDevice::denoising_set_tiles, this, _1, &denoising);
729
730                 denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
731                 denoising.render_buffer.samples = tile.sample;
732
733                 RenderTile rtiles[9];
734                 rtiles[4] = tile;
735                 task.map_neighbor_tiles(rtiles, this);
736                 denoising.tiles_from_rendertiles(rtiles);
737
738                 denoising.init_from_devicetask(task);
739
740                 denoising.run_denoising();
741
742                 task.unmap_neighbor_tiles(rtiles, this);
743
744                 task.update_progress(&tile, tile.w*tile.h);
745         }
746
747         void thread_render(DeviceTask& task)
748         {
749                 if(task_pool.canceled()) {
750                         if(task.need_finish_queue == false)
751                                 return;
752                 }
753
754                 /* allocate buffer for kernel globals */
755                 device_only_memory<KernelGlobals> kgbuffer(this, "kernel_globals");
756                 kgbuffer.alloc_to_device(1);
757
758                 KernelGlobals *kg = new ((void*) kgbuffer.device_pointer) KernelGlobals(thread_kernel_globals_init());
759
760                 CPUSplitKernel *split_kernel = NULL;
761                 if(use_split_kernel) {
762                         split_kernel = new CPUSplitKernel(this);
763                         if(!split_kernel->load_kernels(requested_features)) {
764                                 thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
765                                 kgbuffer.free();
766                                 delete split_kernel;
767                                 return;
768                         }
769                 }
770
771                 RenderTile tile;
772                 while(task.acquire_tile(this, tile)) {
773                         if(tile.task == RenderTile::PATH_TRACE) {
774                                 if(use_split_kernel) {
775                                         device_only_memory<uchar> void_buffer(this, "void_buffer");
776                                         split_kernel->path_trace(&task, tile, kgbuffer, void_buffer);
777                                 }
778                                 else {
779                                         path_trace(task, tile, kg);
780                                 }
781                         }
782                         else if(tile.task == RenderTile::DENOISE) {
783                                 denoise(task, tile);
784                         }
785
786                         task.release_tile(tile);
787
788                         if(task_pool.canceled()) {
789                                 if(task.need_finish_queue == false)
790                                         break;
791                         }
792                 }
793
794                 thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
795                 kg->~KernelGlobals();
796                 kgbuffer.free();
797                 delete split_kernel;
798         }
799
800         void thread_film_convert(DeviceTask& task)
801         {
802                 float sample_scale = 1.0f/(task.sample + 1);
803
804                 if(task.rgba_half) {
805                         for(int y = task.y; y < task.y + task.h; y++)
806                                 for(int x = task.x; x < task.x + task.w; x++)
807                                         convert_to_half_float_kernel()(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
808                                                                        sample_scale, x, y, task.offset, task.stride);
809                 }
810                 else {
811                         for(int y = task.y; y < task.y + task.h; y++)
812                                 for(int x = task.x; x < task.x + task.w; x++)
813                                         convert_to_byte_kernel()(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
814                                                                  sample_scale, x, y, task.offset, task.stride);
815
816                 }
817         }
818
819         void thread_shader(DeviceTask& task)
820         {
821                 KernelGlobals kg = kernel_globals;
822
823 #ifdef WITH_OSL
824                 OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
825 #endif
826                 for(int sample = 0; sample < task.num_samples; sample++) {
827                         for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
828                                 shader_kernel()(&kg,
829                                                 (uint4*)task.shader_input,
830                                                 (float4*)task.shader_output,
831                                                 task.shader_eval_type,
832                                                 task.shader_filter,
833                                                 x,
834                                                 task.offset,
835                                                 sample);
836
837                         if(task.get_cancel() || task_pool.canceled())
838                                 break;
839
840                         task.update_progress(NULL);
841
842                 }
843
844 #ifdef WITH_OSL
845                 OSLShader::thread_free(&kg);
846 #endif
847         }
848
849         int get_split_task_count(DeviceTask& task)
850         {
851                 if(task.type == DeviceTask::SHADER)
852                         return task.get_subtask_count(info.cpu_threads, 256);
853                 else
854                         return task.get_subtask_count(info.cpu_threads);
855         }
856
857         void task_add(DeviceTask& task)
858         {
859                 /* Load texture info. */
860                 load_texture_info();
861
862                 /* split task into smaller ones */
863                 list<DeviceTask> tasks;
864
865                 if(task.type == DeviceTask::SHADER)
866                         task.split(tasks, info.cpu_threads, 256);
867                 else
868                         task.split(tasks, info.cpu_threads);
869
870                 foreach(DeviceTask& task, tasks)
871                         task_pool.push(new CPUDeviceTask(this, task));
872         }
873
874         void task_wait()
875         {
876                 task_pool.wait_work();
877         }
878
879         void task_cancel()
880         {
881                 task_pool.cancel();
882         }
883
884 protected:
885         inline KernelGlobals thread_kernel_globals_init()
886         {
887                 KernelGlobals kg = kernel_globals;
888                 kg.transparent_shadow_intersections = NULL;
889                 const int decoupled_count = sizeof(kg.decoupled_volume_steps) /
890                                             sizeof(*kg.decoupled_volume_steps);
891                 for(int i = 0; i < decoupled_count; ++i) {
892                         kg.decoupled_volume_steps[i] = NULL;
893                 }
894                 kg.decoupled_volume_steps_index = 0;
895 #ifdef WITH_OSL
896                 OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
897 #endif
898                 return kg;
899         }
900
901         inline void thread_kernel_globals_free(KernelGlobals *kg)
902         {
903                 if(kg == NULL) {
904                         return;
905                 }
906
907                 if(kg->transparent_shadow_intersections != NULL) {
908                         free(kg->transparent_shadow_intersections);
909                 }
910                 const int decoupled_count = sizeof(kg->decoupled_volume_steps) /
911                                             sizeof(*kg->decoupled_volume_steps);
912                 for(int i = 0; i < decoupled_count; ++i) {
913                         if(kg->decoupled_volume_steps[i] != NULL) {
914                                 free(kg->decoupled_volume_steps[i]);
915                         }
916                 }
917 #ifdef WITH_OSL
918                 OSLShader::thread_free(kg);
919 #endif
920         }
921
922         virtual bool load_kernels(DeviceRequestedFeatures& requested_features_) {
923                 requested_features = requested_features_;
924
925                 return true;
926         }
927 };
928
929 /* split kernel */
930
931 class CPUSplitKernelFunction : public SplitKernelFunction {
932 public:
933         CPUDevice* device;
934         void (*func)(KernelGlobals *kg, KernelData *data);
935
936         CPUSplitKernelFunction(CPUDevice* device) : device(device), func(NULL) {}
937         ~CPUSplitKernelFunction() {}
938
939         virtual bool enqueue(const KernelDimensions& dim, device_memory& kernel_globals, device_memory& data)
940         {
941                 if(!func) {
942                         return false;
943                 }
944
945                 KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
946                 kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
947
948                 for(int y = 0; y < dim.global_size[1]; y++) {
949                         for(int x = 0; x < dim.global_size[0]; x++) {
950                                 kg->global_id = make_int2(x, y);
951
952                                 func(kg, (KernelData*)data.device_pointer);
953                         }
954                 }
955
956                 return true;
957         }
958 };
959
960 CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device)
961 {
962 }
963
964 bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
965                                                     RenderTile& rtile,
966                                                     int num_global_elements,
967                                                     device_memory& kernel_globals,
968                                                     device_memory& data,
969                                                     device_memory& split_data,
970                                                     device_memory& ray_state,
971                                                     device_memory& queue_index,
972                                                     device_memory& use_queues_flags,
973                                                     device_memory& work_pool_wgs)
974 {
975         KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
976         kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
977
978         for(int y = 0; y < dim.global_size[1]; y++) {
979                 for(int x = 0; x < dim.global_size[0]; x++) {
980                         kg->global_id = make_int2(x, y);
981
982                         device->data_init_kernel()((KernelGlobals*)kernel_globals.device_pointer,
983                                                    (KernelData*)data.device_pointer,
984                                                    (void*)split_data.device_pointer,
985                                                    num_global_elements,
986                                                    (char*)ray_state.device_pointer,
987                                                    rtile.start_sample,
988                                                    rtile.start_sample + rtile.num_samples,
989                                                    rtile.x,
990                                                    rtile.y,
991                                                    rtile.w,
992                                                    rtile.h,
993                                                    rtile.offset,
994                                                    rtile.stride,
995                                                    (int*)queue_index.device_pointer,
996                                                    dim.global_size[0] * dim.global_size[1],
997                                                    (char*)use_queues_flags.device_pointer,
998                                                    (uint*)work_pool_wgs.device_pointer,
999                                                    rtile.num_samples,
1000                                                    (float*)rtile.buffer);
1001                 }
1002         }
1003
1004         return true;
1005 }
1006
1007 SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(const string& kernel_name,
1008                                                                const DeviceRequestedFeatures&)
1009 {
1010         CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);
1011
1012         kernel->func = device->split_kernels[kernel_name]();
1013         if(!kernel->func) {
1014                 delete kernel;
1015                 return NULL;
1016         }
1017
1018         return kernel;
1019 }
1020
1021 int2 CPUSplitKernel::split_kernel_local_size()
1022 {
1023         return make_int2(1, 1);
1024 }
1025
1026 int2 CPUSplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask * /*task*/) {
1027         return make_int2(1, 1);
1028 }
1029
1030 uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device_memory& /*data*/, size_t num_threads) {
1031         KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
1032
1033         return split_data_buffer_size(kg, num_threads);
1034 }
1035
1036 Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
1037 {
1038         return new CPUDevice(info, stats, background);
1039 }
1040
1041 void device_cpu_info(vector<DeviceInfo>& devices)
1042 {
1043         DeviceInfo info;
1044
1045         info.type = DEVICE_CPU;
1046         info.description = system_cpu_brand_string();
1047         info.id = "CPU";
1048         info.num = 0;
1049         info.advanced_shading = true;
1050         info.has_qbvh = system_cpu_support_sse2();
1051         info.has_volume_decoupled = true;
1052         info.has_osl = true;
1053         info.has_half_images = true;
1054
1055         devices.insert(devices.begin(), info);
1056 }
1057
1058 string device_cpu_capabilities(void)
1059 {
1060         string capabilities = "";
1061         capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
1062         capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
1063         capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
1064         capabilities += system_cpu_support_avx() ? "AVX " : "";
1065         capabilities += system_cpu_support_avx2() ? "AVX2" : "";
1066         if(capabilities[capabilities.size() - 1] == ' ')
1067                 capabilities.resize(capabilities.size() - 1);
1068         return capabilities;
1069 }
1070
1071 CCL_NAMESPACE_END