Merge branch 'master' into blender2.8
[blender.git] / intern / cycles / device / device_cpu.cpp
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include <stdlib.h>
18 #include <string.h>
19
20 /* So ImathMath is included before our kernel_cpu_compat. */
21 #ifdef WITH_OSL
22 /* So no context pollution happens from indirectly included windows.h */
23 #  include "util/util_windows.h"
24 #  include <OSL/oslexec.h>
25 #endif
26
27 #include "device/device.h"
28 #include "device/device_denoising.h"
29 #include "device/device_intern.h"
30 #include "device/device_split_kernel.h"
31
32 #include "kernel/kernel.h"
33 #include "kernel/kernel_compat_cpu.h"
34 #include "kernel/kernel_types.h"
35 #include "kernel/split/kernel_split_data.h"
36 #include "kernel/kernel_globals.h"
37
38 #include "kernel/filter/filter.h"
39
40 #include "kernel/osl/osl_shader.h"
41 #include "kernel/osl/osl_globals.h"
42
43 #include "render/buffers.h"
44
45 #include "util/util_debug.h"
46 #include "util/util_foreach.h"
47 #include "util/util_function.h"
48 #include "util/util_logging.h"
49 #include "util/util_map.h"
50 #include "util/util_opengl.h"
51 #include "util/util_progress.h"
52 #include "util/util_system.h"
53 #include "util/util_thread.h"
54
55 CCL_NAMESPACE_BEGIN
56
57 class CPUDevice;
58
59 /* Has to be outside of the class to be shared across template instantiations. */
60 static const char *logged_architecture = "";
61
62 template<typename F>
63 class KernelFunctions {
64 public:
65         KernelFunctions()
66         {
67                 kernel = (F)NULL;
68         }
69
70         KernelFunctions(F kernel_default,
71                         F kernel_sse2,
72                         F kernel_sse3,
73                         F kernel_sse41,
74                         F kernel_avx,
75                         F kernel_avx2)
76         {
77                 const char *architecture_name = "default";
78                 kernel = kernel_default;
79
80                 /* Silence potential warnings about unused variables
81                  * when compiling without some architectures. */
82                 (void)kernel_sse2;
83                 (void)kernel_sse3;
84                 (void)kernel_sse41;
85                 (void)kernel_avx;
86                 (void)kernel_avx2;
87 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
88                 if(system_cpu_support_avx2()) {
89                         architecture_name = "AVX2";
90                         kernel = kernel_avx2;
91                 }
92                 else
93 #endif
94 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
95                 if(system_cpu_support_avx()) {
96                         architecture_name = "AVX";
97                         kernel = kernel_avx;
98                 }
99                 else
100 #endif
101 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
102                 if(system_cpu_support_sse41()) {
103                         architecture_name = "SSE4.1";
104                         kernel = kernel_sse41;
105                 }
106                 else
107 #endif
108 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
109                 if(system_cpu_support_sse3()) {
110                         architecture_name = "SSE3";
111                         kernel = kernel_sse3;
112                 }
113                 else
114 #endif
115 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
116                 if(system_cpu_support_sse2()) {
117                         architecture_name = "SSE2";
118                         kernel = kernel_sse2;
119                 }
120 #endif
121
122                 if(strstr(architecture_name, logged_architecture) != 0) {
123                         VLOG(1) << "Will be using " << architecture_name << " kernels.";
124                         logged_architecture = architecture_name;
125                 }
126         }
127
128         inline F operator()() const {
129                 assert(kernel);
130                 return kernel;
131         }
132 protected:
133         F kernel;
134 };
135
136 class CPUSplitKernel : public DeviceSplitKernel {
137         CPUDevice *device;
138 public:
139         explicit CPUSplitKernel(CPUDevice *device);
140
141         virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
142                                                     RenderTile& rtile,
143                                                     int num_global_elements,
144                                                     device_memory& kernel_globals,
145                                                     device_memory& kernel_data_,
146                                                     device_memory& split_data,
147                                                     device_memory& ray_state,
148                                                     device_memory& queue_index,
149                                                     device_memory& use_queues_flag,
150                                                     device_memory& work_pool_wgs);
151
152         virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name,
153                                                                const DeviceRequestedFeatures&);
154         virtual int2 split_kernel_local_size();
155         virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
156         virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
157 };
158
159 class CPUDevice : public Device
160 {
161 public:
162         TaskPool task_pool;
163         KernelGlobals kernel_globals;
164
165 #ifdef WITH_OSL
166         OSLGlobals osl_globals;
167 #endif
168
169         bool use_split_kernel;
170
171         DeviceRequestedFeatures requested_features;
172
173         KernelFunctions<void(*)(KernelGlobals *, float *, unsigned int *, int, int, int, int, int)>   path_trace_kernel;
174         KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)>       convert_to_half_float_kernel;
175         KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)>       convert_to_byte_kernel;
176         KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, float*, int, int, int, int, int)> shader_kernel;
177
178         KernelFunctions<void(*)(int, TilesInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int, bool)> filter_divide_shadow_kernel;
179         KernelFunctions<void(*)(int, TilesInfo*, int, int, int, int, float*, float*, int*, int, int, bool)>               filter_get_feature_kernel;
180         KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                                     filter_detect_outliers_kernel;
181         KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                                     filter_combine_halves_kernel;
182
183         KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
184         KernelFunctions<void(*)(float*, float*, int*, int, int)>                                 filter_nlm_blur_kernel;
185         KernelFunctions<void(*)(float*, float*, int*, int, int)>                                 filter_nlm_calc_weight_kernel;
186         KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)>       filter_nlm_update_output_kernel;
187         KernelFunctions<void(*)(float*, float*, int*, int)>                                      filter_nlm_normalize_kernel;
188
189         KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)>                              filter_construct_transform_kernel;
190         KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int)> filter_nlm_construct_gramian_kernel;
191         KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)>                       filter_finalize_kernel;
192
193         KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*,
194                                ccl_global uint*, int, int, int, int, int, int, int, int, ccl_global int*, int,
195                                ccl_global char*, ccl_global unsigned int*, unsigned int, ccl_global float*)>        data_init_kernel;
196         unordered_map<string, KernelFunctions<void(*)(KernelGlobals*, KernelData*)> > split_kernels;
197
198 #define KERNEL_FUNCTIONS(name) \
199               KERNEL_NAME_EVAL(cpu, name), \
200               KERNEL_NAME_EVAL(cpu_sse2, name), \
201               KERNEL_NAME_EVAL(cpu_sse3, name), \
202               KERNEL_NAME_EVAL(cpu_sse41, name), \
203               KERNEL_NAME_EVAL(cpu_avx, name), \
204               KERNEL_NAME_EVAL(cpu_avx2, name)
205
206         CPUDevice(DeviceInfo& info, Stats &stats, bool background)
207         : Device(info, stats, background),
208 #define REGISTER_KERNEL(name) name ## _kernel(KERNEL_FUNCTIONS(name))
209           REGISTER_KERNEL(path_trace),
210           REGISTER_KERNEL(convert_to_half_float),
211           REGISTER_KERNEL(convert_to_byte),
212           REGISTER_KERNEL(shader),
213           REGISTER_KERNEL(filter_divide_shadow),
214           REGISTER_KERNEL(filter_get_feature),
215           REGISTER_KERNEL(filter_detect_outliers),
216           REGISTER_KERNEL(filter_combine_halves),
217           REGISTER_KERNEL(filter_nlm_calc_difference),
218           REGISTER_KERNEL(filter_nlm_blur),
219           REGISTER_KERNEL(filter_nlm_calc_weight),
220           REGISTER_KERNEL(filter_nlm_update_output),
221           REGISTER_KERNEL(filter_nlm_normalize),
222           REGISTER_KERNEL(filter_construct_transform),
223           REGISTER_KERNEL(filter_nlm_construct_gramian),
224           REGISTER_KERNEL(filter_finalize),
225           REGISTER_KERNEL(data_init)
226 #undef REGISTER_KERNEL
227         {
228
229 #ifdef WITH_OSL
230                 kernel_globals.osl = &osl_globals;
231 #endif
232                 use_split_kernel = DebugFlags().cpu.split_kernel;
233                 if(use_split_kernel) {
234                         VLOG(1) << "Will be using split kernel.";
235                 }
236
237 #define REGISTER_SPLIT_KERNEL(name) split_kernels[#name] = KernelFunctions<void(*)(KernelGlobals*, KernelData*)>(KERNEL_FUNCTIONS(name))
238                 REGISTER_SPLIT_KERNEL(path_init);
239                 REGISTER_SPLIT_KERNEL(scene_intersect);
240                 REGISTER_SPLIT_KERNEL(lamp_emission);
241                 REGISTER_SPLIT_KERNEL(do_volume);
242                 REGISTER_SPLIT_KERNEL(queue_enqueue);
243                 REGISTER_SPLIT_KERNEL(indirect_background);
244                 REGISTER_SPLIT_KERNEL(shader_setup);
245                 REGISTER_SPLIT_KERNEL(shader_sort);
246                 REGISTER_SPLIT_KERNEL(shader_eval);
247                 REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao);
248                 REGISTER_SPLIT_KERNEL(subsurface_scatter);
249                 REGISTER_SPLIT_KERNEL(direct_lighting);
250                 REGISTER_SPLIT_KERNEL(shadow_blocked_ao);
251                 REGISTER_SPLIT_KERNEL(shadow_blocked_dl);
252                 REGISTER_SPLIT_KERNEL(enqueue_inactive);
253                 REGISTER_SPLIT_KERNEL(next_iteration_setup);
254                 REGISTER_SPLIT_KERNEL(indirect_subsurface);
255                 REGISTER_SPLIT_KERNEL(buffer_update);
256 #undef REGISTER_SPLIT_KERNEL
257 #undef KERNEL_FUNCTIONS
258         }
259
260         ~CPUDevice()
261         {
262                 task_pool.stop();
263         }
264
265         virtual bool show_samples() const
266         {
267                 return (TaskScheduler::num_threads() == 1);
268         }
269
270         void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/)
271         {
272                 if(name) {
273                         VLOG(1) << "Buffer allocate: " << name << ", "
274                                 << string_human_readable_number(mem.memory_size()) << " bytes. ("
275                                 << string_human_readable_size(mem.memory_size()) << ")";
276                 }
277
278                 mem.device_pointer = mem.data_pointer;
279
280                 if(!mem.device_pointer) {
281                         mem.device_pointer = (device_ptr)malloc(mem.memory_size());
282                 }
283
284                 mem.device_size = mem.memory_size();
285                 stats.mem_alloc(mem.device_size);
286         }
287
288         void mem_copy_to(device_memory& /*mem*/)
289         {
290                 /* no-op */
291         }
292
293         void mem_copy_from(device_memory& /*mem*/,
294                            int /*y*/, int /*w*/, int /*h*/,
295                            int /*elem*/)
296         {
297                 /* no-op */
298         }
299
300         void mem_zero(device_memory& mem)
301         {
302                 memset((void*)mem.device_pointer, 0, mem.memory_size());
303         }
304
305         void mem_free(device_memory& mem)
306         {
307                 if(mem.device_pointer) {
308                         if(!mem.data_pointer) {
309                                 free((void*)mem.device_pointer);
310                         }
311                         mem.device_pointer = 0;
312                         stats.mem_free(mem.device_size);
313                         mem.device_size = 0;
314                 }
315         }
316
317         virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/)
318         {
319                 return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset));
320         }
321
322         void const_copy_to(const char *name, void *host, size_t size)
323         {
324                 kernel_const_copy(&kernel_globals, name, host, size);
325         }
326
327         void tex_alloc(const char *name,
328                        device_memory& mem,
329                        InterpolationType interpolation,
330                        ExtensionType extension)
331         {
332                 VLOG(1) << "Texture allocate: " << name << ", "
333                         << string_human_readable_number(mem.memory_size()) << " bytes. ("
334                         << string_human_readable_size(mem.memory_size()) << ")";
335                 kernel_tex_copy(&kernel_globals,
336                                 name,
337                                 mem.data_pointer,
338                                 mem.data_width,
339                                 mem.data_height,
340                                 mem.data_depth,
341                                 interpolation,
342                                 extension);
343                 mem.device_pointer = mem.data_pointer;
344                 mem.device_size = mem.memory_size();
345                 stats.mem_alloc(mem.device_size);
346         }
347
348         void tex_free(device_memory& mem)
349         {
350                 if(mem.device_pointer) {
351                         mem.device_pointer = 0;
352                         stats.mem_free(mem.device_size);
353                         mem.device_size = 0;
354                 }
355         }
356
357         void *osl_memory()
358         {
359 #ifdef WITH_OSL
360                 return &osl_globals;
361 #else
362                 return NULL;
363 #endif
364         }
365
366         void thread_run(DeviceTask *task)
367         {
368                 if(task->type == DeviceTask::RENDER) {
369                         thread_render(*task);
370                 }
371                 else if(task->type == DeviceTask::FILM_CONVERT)
372                         thread_film_convert(*task);
373                 else if(task->type == DeviceTask::SHADER)
374                         thread_shader(*task);
375         }
376
377         class CPUDeviceTask : public DeviceTask {
378         public:
379                 CPUDeviceTask(CPUDevice *device, DeviceTask& task)
380                 : DeviceTask(task)
381                 {
382                         run = function_bind(&CPUDevice::thread_run, device, this);
383                 }
384         };
385
386         bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task)
387         {
388                 mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY);
389
390                 TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer;
391                 for(int i = 0; i < 9; i++) {
392                         tiles->buffers[i] = buffers[i];
393                 }
394
395                 return true;
396         }
397
398         bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
399                                        DenoisingTask *task)
400         {
401                 int4 rect = task->rect;
402                 int   r   = task->nlm_state.r;
403                 int   f   = task->nlm_state.f;
404                 float a   = task->nlm_state.a;
405                 float k_2 = task->nlm_state.k_2;
406
407                 int w = align_up(rect.z-rect.x, 4);
408                 int h = rect.w-rect.y;
409
410                 float *blurDifference = (float*) task->nlm_state.temporary_1_ptr;
411                 float *difference     = (float*) task->nlm_state.temporary_2_ptr;
412                 float *weightAccum    = (float*) task->nlm_state.temporary_3_ptr;
413
414                 memset(weightAccum, 0, sizeof(float)*w*h);
415                 memset((float*) out_ptr, 0, sizeof(float)*w*h);
416
417                 for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
418                         int dy = i / (2*r+1) - r;
419                         int dx = i % (2*r+1) - r;
420
421                         int local_rect[4] = {max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)};
422                         filter_nlm_calc_difference_kernel()(dx, dy,
423                                                             (float*) guide_ptr,
424                                                             (float*) variance_ptr,
425                                                             difference,
426                                                             local_rect,
427                                                             w, 0,
428                                                             a, k_2);
429
430                         filter_nlm_blur_kernel()       (difference, blurDifference, local_rect, w, f);
431                         filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f);
432                         filter_nlm_blur_kernel()       (difference, blurDifference, local_rect, w, f);
433
434                         filter_nlm_update_output_kernel()(dx, dy,
435                                                           blurDifference,
436                                                           (float*) image_ptr,
437                                                           (float*) out_ptr,
438                                                           weightAccum,
439                                                           local_rect,
440                                                           w, f);
441                 }
442
443                 int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y};
444                 filter_nlm_normalize_kernel()((float*) out_ptr, weightAccum, local_rect, w);
445
446                 return true;
447         }
448
449         bool denoising_construct_transform(DenoisingTask *task)
450         {
451                 for(int y = 0; y < task->filter_area.w; y++) {
452                         for(int x = 0; x < task->filter_area.z; x++) {
453                                 filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer,
454                                                                     x + task->filter_area.x,
455                                                                     y + task->filter_area.y,
456                                                                     y*task->filter_area.z + x,
457                                                                     (float*) task->storage.transform.device_pointer,
458                                                                     (int*)   task->storage.rank.device_pointer,
459                                                                     &task->rect.x,
460                                                                     task->buffer.pass_stride,
461                                                                     task->radius,
462                                                                     task->pca_threshold);
463                         }
464                 }
465                 return true;
466         }
467
468         bool denoising_reconstruct(device_ptr color_ptr,
469                                    device_ptr color_variance_ptr,
470                                    device_ptr output_ptr,
471                                    DenoisingTask *task)
472         {
473                 mem_zero(task->storage.XtWX);
474                 mem_zero(task->storage.XtWY);
475
476                 float *difference     = (float*) task->reconstruction_state.temporary_1_ptr;
477                 float *blurDifference = (float*) task->reconstruction_state.temporary_2_ptr;
478
479                 int r = task->radius;
480                 for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
481                         int dy = i / (2*r+1) - r;
482                         int dx = i % (2*r+1) - r;
483
484                         int local_rect[4] = {max(0, -dx), max(0, -dy),
485                                              task->reconstruction_state.source_w - max(0, dx),
486                                              task->reconstruction_state.source_h - max(0, dy)};
487                         filter_nlm_calc_difference_kernel()(dx, dy,
488                                                             (float*) color_ptr,
489                                                             (float*) color_variance_ptr,
490                                                             difference,
491                                                             local_rect,
492                                                             task->buffer.w,
493                                                             task->buffer.pass_stride,
494                                                             1.0f,
495                                                             task->nlm_k_2);
496                         filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4);
497                         filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.w, 4);
498                         filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4);
499                         filter_nlm_construct_gramian_kernel()(dx, dy,
500                                                               blurDifference,
501                                                               (float*)  task->buffer.mem.device_pointer,
502                                                               (float*)  task->storage.transform.device_pointer,
503                                                               (int*)    task->storage.rank.device_pointer,
504                                                               (float*)  task->storage.XtWX.device_pointer,
505                                                               (float3*) task->storage.XtWY.device_pointer,
506                                                               local_rect,
507                                                               &task->reconstruction_state.filter_rect.x,
508                                                               task->buffer.w,
509                                                               task->buffer.h,
510                                                               4,
511                                                               task->buffer.pass_stride);
512                 }
513                 for(int y = 0; y < task->filter_area.w; y++) {
514                         for(int x = 0; x < task->filter_area.z; x++) {
515                                 filter_finalize_kernel()(x,
516                                                          y,
517                                                          y*task->filter_area.z + x,
518                                                          task->buffer.w,
519                                                          task->buffer.h,
520                                                          (float*)  output_ptr,
521                                                          (int*)    task->storage.rank.device_pointer,
522                                                          (float*)  task->storage.XtWX.device_pointer,
523                                                          (float3*) task->storage.XtWY.device_pointer,
524                                                          &task->reconstruction_state.buffer_params.x,
525                                                          task->render_buffer.samples);
526                         }
527                 }
528                 return true;
529         }
530
531         bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
532                                       device_ptr mean_ptr, device_ptr variance_ptr,
533                                       int r, int4 rect, DenoisingTask * /*task*/)
534         {
535                 for(int y = rect.y; y < rect.w; y++) {
536                         for(int x = rect.x; x < rect.z; x++) {
537                                 filter_combine_halves_kernel()(x, y,
538                                                                (float*) mean_ptr,
539                                                                (float*) variance_ptr,
540                                                                (float*) a_ptr,
541                                                                (float*) b_ptr,
542                                                                &rect.x,
543                                                                r);
544                         }
545                 }
546                 return true;
547         }
548
549         bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr,
550                                      device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
551                                      device_ptr buffer_variance_ptr, DenoisingTask *task)
552         {
553                 for(int y = task->rect.y; y < task->rect.w; y++) {
554                         for(int x = task->rect.x; x < task->rect.z; x++) {
555                                 filter_divide_shadow_kernel()(task->render_buffer.samples,
556                                                               task->tiles,
557                                                               x, y,
558                                                               (float*) a_ptr,
559                                                               (float*) b_ptr,
560                                                               (float*) sample_variance_ptr,
561                                                               (float*) sv_variance_ptr,
562                                                               (float*) buffer_variance_ptr,
563                                                               &task->rect.x,
564                                                               task->render_buffer.pass_stride,
565                                                               task->render_buffer.denoising_data_offset,
566                                                               use_split_kernel);
567                         }
568                 }
569                 return true;
570         }
571
572         bool denoising_get_feature(int mean_offset,
573                                    int variance_offset,
574                                    device_ptr mean_ptr,
575                                    device_ptr variance_ptr,
576                                    DenoisingTask *task)
577         {
578                 for(int y = task->rect.y; y < task->rect.w; y++) {
579                         for(int x = task->rect.x; x < task->rect.z; x++) {
580                                 filter_get_feature_kernel()(task->render_buffer.samples,
581                                                             task->tiles,
582                                                             mean_offset,
583                                                             variance_offset,
584                                                             x, y,
585                                                             (float*) mean_ptr,
586                                                             (float*) variance_ptr,
587                                                             &task->rect.x,
588                                                             task->render_buffer.pass_stride,
589                                                             task->render_buffer.denoising_data_offset,
590                                                             use_split_kernel);
591                         }
592                 }
593                 return true;
594         }
595
596         bool denoising_detect_outliers(device_ptr image_ptr,
597                                        device_ptr variance_ptr,
598                                        device_ptr depth_ptr,
599                                        device_ptr output_ptr,
600                                        DenoisingTask *task)
601         {
602                 for(int y = task->rect.y; y < task->rect.w; y++) {
603                         for(int x = task->rect.x; x < task->rect.z; x++) {
604                                 filter_detect_outliers_kernel()(x, y,
605                                                                 (float*) image_ptr,
606                                                                 (float*) variance_ptr,
607                                                                 (float*) depth_ptr,
608                                                                 (float*) output_ptr,
609                                                                 &task->rect.x,
610                                                                 task->buffer.pass_stride);
611                         }
612                 }
613                 return true;
614         }
615
616         void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
617         {
618                 float *render_buffer = (float*)tile.buffer;
619                 uint *rng_state = (uint*)tile.rng_state;
620                 int start_sample = tile.start_sample;
621                 int end_sample = tile.start_sample + tile.num_samples;
622
623                 for(int sample = start_sample; sample < end_sample; sample++) {
624                         if(task.get_cancel() || task_pool.canceled()) {
625                                 if(task.need_finish_queue == false)
626                                         break;
627                         }
628
629                         for(int y = tile.y; y < tile.y + tile.h; y++) {
630                                 for(int x = tile.x; x < tile.x + tile.w; x++) {
631                                         path_trace_kernel()(kg, render_buffer, rng_state,
632                                                             sample, x, y, tile.offset, tile.stride);
633                                 }
634                         }
635
636                         tile.sample = sample + 1;
637
638                         task.update_progress(&tile, tile.w*tile.h);
639                 }
640         }
641
642         void denoise(DeviceTask &task, RenderTile &tile)
643         {
644                 tile.sample = tile.start_sample + tile.num_samples;
645
646                 DenoisingTask denoising(this);
647
648                 denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising);
649                 denoising.functions.reconstruct = function_bind(&CPUDevice::denoising_reconstruct, this, _1, _2, _3, &denoising);
650                 denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
651                 denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
652                 denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
653                 denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
654                 denoising.functions.detect_outliers = function_bind(&CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
655                 denoising.functions.set_tiles = function_bind(&CPUDevice::denoising_set_tiles, this, _1, &denoising);
656
657                 denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
658                 denoising.render_buffer.samples = tile.sample;
659
660                 RenderTile rtiles[9];
661                 rtiles[4] = tile;
662                 task.map_neighbor_tiles(rtiles, this);
663                 denoising.tiles_from_rendertiles(rtiles);
664
665                 denoising.init_from_devicetask(task);
666
667                 denoising.run_denoising();
668
669                 task.unmap_neighbor_tiles(rtiles, this);
670
671                 task.update_progress(&tile, tile.w*tile.h);
672         }
673
674         void thread_render(DeviceTask& task)
675         {
676                 if(task_pool.canceled()) {
677                         if(task.need_finish_queue == false)
678                                 return;
679                 }
680
681                 /* allocate buffer for kernel globals */
682                 device_only_memory<KernelGlobals> kgbuffer;
683                 kgbuffer.resize(1);
684                 mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE);
685
686                 KernelGlobals *kg = new ((void*) kgbuffer.device_pointer) KernelGlobals(thread_kernel_globals_init());
687
688                 CPUSplitKernel *split_kernel = NULL;
689                 if(use_split_kernel) {
690                         split_kernel = new CPUSplitKernel(this);
691                         requested_features.max_closure = MAX_CLOSURE;
692                         if(!split_kernel->load_kernels(requested_features)) {
693                                 thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
694                                 mem_free(kgbuffer);
695
696                                 delete split_kernel;
697                                 return;
698                         }
699                 }
700
701                 RenderTile tile;
702                 while(task.acquire_tile(this, tile)) {
703                         if(tile.task == RenderTile::PATH_TRACE) {
704                                 if(use_split_kernel) {
705                                         device_memory data;
706                                         split_kernel->path_trace(&task, tile, kgbuffer, data);
707                                 }
708                                 else {
709                                         path_trace(task, tile, kg);
710                                 }
711                         }
712                         else if(tile.task == RenderTile::DENOISE) {
713                                 denoise(task, tile);
714                         }
715
716                         task.release_tile(tile);
717
718                         if(task_pool.canceled()) {
719                                 if(task.need_finish_queue == false)
720                                         break;
721                         }
722                 }
723
724                 thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
725                 kg->~KernelGlobals();
726                 mem_free(kgbuffer);
727                 delete split_kernel;
728         }
729
730         void thread_film_convert(DeviceTask& task)
731         {
732                 float sample_scale = 1.0f/(task.sample + 1);
733
734                 if(task.rgba_half) {
735                         for(int y = task.y; y < task.y + task.h; y++)
736                                 for(int x = task.x; x < task.x + task.w; x++)
737                                         convert_to_half_float_kernel()(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
738                                                                        sample_scale, x, y, task.offset, task.stride);
739                 }
740                 else {
741                         for(int y = task.y; y < task.y + task.h; y++)
742                                 for(int x = task.x; x < task.x + task.w; x++)
743                                         convert_to_byte_kernel()(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
744                                                                  sample_scale, x, y, task.offset, task.stride);
745
746                 }
747         }
748
749         void thread_shader(DeviceTask& task)
750         {
751                 KernelGlobals kg = kernel_globals;
752
753 #ifdef WITH_OSL
754                 OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
755 #endif
756                 for(int sample = 0; sample < task.num_samples; sample++) {
757                         for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
758                                 shader_kernel()(&kg,
759                                                 (uint4*)task.shader_input,
760                                                 (float4*)task.shader_output,
761                                                 (float*)task.shader_output_luma,
762                                                 task.shader_eval_type,
763                                                 task.shader_filter,
764                                                 x,
765                                                 task.offset,
766                                                 sample);
767
768                         if(task.get_cancel() || task_pool.canceled())
769                                 break;
770
771                         task.update_progress(NULL);
772
773                 }
774
775 #ifdef WITH_OSL
776                 OSLShader::thread_free(&kg);
777 #endif
778         }
779
780         int get_split_task_count(DeviceTask& task)
781         {
782                 if(task.type == DeviceTask::SHADER)
783                         return task.get_subtask_count(TaskScheduler::num_threads(), 256);
784                 else
785                         return task.get_subtask_count(TaskScheduler::num_threads());
786         }
787
788         void task_add(DeviceTask& task)
789         {
790                 /* split task into smaller ones */
791                 list<DeviceTask> tasks;
792
793                 if(task.type == DeviceTask::SHADER)
794                         task.split(tasks, TaskScheduler::num_threads(), 256);
795                 else
796                         task.split(tasks, TaskScheduler::num_threads());
797
798                 foreach(DeviceTask& task, tasks)
799                         task_pool.push(new CPUDeviceTask(this, task));
800         }
801
802         void task_wait()
803         {
804                 task_pool.wait_work();
805         }
806
807         void task_cancel()
808         {
809                 task_pool.cancel();
810         }
811
812 protected:
813         inline KernelGlobals thread_kernel_globals_init()
814         {
815                 KernelGlobals kg = kernel_globals;
816                 kg.transparent_shadow_intersections = NULL;
817                 const int decoupled_count = sizeof(kg.decoupled_volume_steps) /
818                                             sizeof(*kg.decoupled_volume_steps);
819                 for(int i = 0; i < decoupled_count; ++i) {
820                         kg.decoupled_volume_steps[i] = NULL;
821                 }
822                 kg.decoupled_volume_steps_index = 0;
823 #ifdef WITH_OSL
824                 OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
825 #endif
826                 return kg;
827         }
828
829         inline void thread_kernel_globals_free(KernelGlobals *kg)
830         {
831                 if(kg == NULL) {
832                         return;
833                 }
834
835                 if(kg->transparent_shadow_intersections != NULL) {
836                         free(kg->transparent_shadow_intersections);
837                 }
838                 const int decoupled_count = sizeof(kg->decoupled_volume_steps) /
839                                             sizeof(*kg->decoupled_volume_steps);
840                 for(int i = 0; i < decoupled_count; ++i) {
841                         if(kg->decoupled_volume_steps[i] != NULL) {
842                                 free(kg->decoupled_volume_steps[i]);
843                         }
844                 }
845 #ifdef WITH_OSL
846                 OSLShader::thread_free(kg);
847 #endif
848         }
849
850         virtual bool load_kernels(DeviceRequestedFeatures& requested_features_) {
851                 requested_features = requested_features_;
852
853                 return true;
854         }
855 };
856
857 /* split kernel */
858
859 class CPUSplitKernelFunction : public SplitKernelFunction {
860 public:
861         CPUDevice* device;
862         void (*func)(KernelGlobals *kg, KernelData *data);
863
864         CPUSplitKernelFunction(CPUDevice* device) : device(device), func(NULL) {}
865         ~CPUSplitKernelFunction() {}
866
867         virtual bool enqueue(const KernelDimensions& dim, device_memory& kernel_globals, device_memory& data)
868         {
869                 if(!func) {
870                         return false;
871                 }
872
873                 KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
874                 kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
875
876                 for(int y = 0; y < dim.global_size[1]; y++) {
877                         for(int x = 0; x < dim.global_size[0]; x++) {
878                                 kg->global_id = make_int2(x, y);
879
880                                 func(kg, (KernelData*)data.device_pointer);
881                         }
882                 }
883
884                 return true;
885         }
886 };
887
888 CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device)
889 {
890 }
891
892 bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
893                                                     RenderTile& rtile,
894                                                     int num_global_elements,
895                                                     device_memory& kernel_globals,
896                                                     device_memory& data,
897                                                     device_memory& split_data,
898                                                     device_memory& ray_state,
899                                                     device_memory& queue_index,
900                                                     device_memory& use_queues_flags,
901                                                     device_memory& work_pool_wgs)
902 {
903         KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
904         kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
905
906         for(int y = 0; y < dim.global_size[1]; y++) {
907                 for(int x = 0; x < dim.global_size[0]; x++) {
908                         kg->global_id = make_int2(x, y);
909
910                         device->data_init_kernel()((KernelGlobals*)kernel_globals.device_pointer,
911                                                    (KernelData*)data.device_pointer,
912                                                    (void*)split_data.device_pointer,
913                                                    num_global_elements,
914                                                    (char*)ray_state.device_pointer,
915                                                    (uint*)rtile.rng_state,
916                                                    rtile.start_sample,
917                                                    rtile.start_sample + rtile.num_samples,
918                                                    rtile.x,
919                                                    rtile.y,
920                                                    rtile.w,
921                                                    rtile.h,
922                                                    rtile.offset,
923                                                    rtile.stride,
924                                                    (int*)queue_index.device_pointer,
925                                                    dim.global_size[0] * dim.global_size[1],
926                                                    (char*)use_queues_flags.device_pointer,
927                                                    (uint*)work_pool_wgs.device_pointer,
928                                                    rtile.num_samples,
929                                                    (float*)rtile.buffer);
930                 }
931         }
932
933         return true;
934 }
935
936 SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(const string& kernel_name,
937                                                                const DeviceRequestedFeatures&)
938 {
939         CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);
940
941         kernel->func = device->split_kernels[kernel_name]();
942         if(!kernel->func) {
943                 delete kernel;
944                 return NULL;
945         }
946
947         return kernel;
948 }
949
950 int2 CPUSplitKernel::split_kernel_local_size()
951 {
952         return make_int2(1, 1);
953 }
954
955 int2 CPUSplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask * /*task*/) {
956         return make_int2(1, 1);
957 }
958
959 uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device_memory& /*data*/, size_t num_threads) {
960         KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
961
962         return split_data_buffer_size(kg, num_threads);
963 }
964
965 Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
966 {
967         return new CPUDevice(info, stats, background);
968 }
969
970 void device_cpu_info(vector<DeviceInfo>& devices)
971 {
972         DeviceInfo info;
973
974         info.type = DEVICE_CPU;
975         info.description = system_cpu_brand_string();
976         info.id = "CPU";
977         info.num = 0;
978         info.advanced_shading = true;
979         info.pack_images = false;
980
981         devices.insert(devices.begin(), info);
982 }
983
984 string device_cpu_capabilities(void)
985 {
986         string capabilities = "";
987         capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
988         capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
989         capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
990         capabilities += system_cpu_support_avx() ? "AVX " : "";
991         capabilities += system_cpu_support_avx2() ? "AVX2" : "";
992         if(capabilities[capabilities.size() - 1] == ' ')
993                 capabilities.resize(capabilities.size() - 1);
994         return capabilities;
995 }
996
997 CCL_NAMESPACE_END