Fix T74504: Cycles wrong progress bar with CPU adaptive sampling
[blender.git] / intern / cycles / device / device_cpu.cpp
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include <stdlib.h>
18 #include <string.h>
19
20 /* So ImathMath is included before our kernel_cpu_compat. */
21 #ifdef WITH_OSL
22 /* So no context pollution happens from indirectly included windows.h */
23 #  include "util/util_windows.h"
24 #  include <OSL/oslexec.h>
25 #endif
26
27 #include "device/device.h"
28 #include "device/device_denoising.h"
29 #include "device/device_intern.h"
30 #include "device/device_split_kernel.h"
31
32 // clang-format off
33 #include "kernel/kernel.h"
34 #include "kernel/kernel_compat_cpu.h"
35 #include "kernel/kernel_types.h"
36 #include "kernel/split/kernel_split_data.h"
37 #include "kernel/kernel_globals.h"
38 #include "kernel/kernel_adaptive_sampling.h"
39
40 #include "kernel/filter/filter.h"
41
42 #include "kernel/osl/osl_shader.h"
43 #include "kernel/osl/osl_globals.h"
44 // clang-format on
45
46 #include "render/buffers.h"
47 #include "render/coverage.h"
48
49 #include "util/util_debug.h"
50 #include "util/util_foreach.h"
51 #include "util/util_function.h"
52 #include "util/util_logging.h"
53 #include "util/util_map.h"
54 #include "util/util_opengl.h"
55 #include "util/util_optimization.h"
56 #include "util/util_progress.h"
57 #include "util/util_system.h"
58 #include "util/util_thread.h"
59
60 CCL_NAMESPACE_BEGIN
61
/* Forward declaration; CPUSplitKernel below stores a pointer to its device. */
class CPUDevice;

/* Has to be outside of the class to be shared across template instantiations.
 * Remembers the last architecture name logged so the "Will be using ..."
 * message is only emitted when the selection changes. */
static const char *logged_architecture = "";
66
/* Holds one kernel entry point compiled for several CPU instruction sets and
 * selects, at construction time, the most capable variant that both the debug
 * flags allow and the running CPU supports. operator() returns the choice. */
template<typename F> class KernelFunctions {
 public:
  KernelFunctions()
  {
    kernel = (F)NULL;
  }

  /* Receives one function pointer per compiled architecture; picks the best
   * usable one (AVX2 down to the plain default) and stores it in `kernel`. */
  KernelFunctions(
      F kernel_default, F kernel_sse2, F kernel_sse3, F kernel_sse41, F kernel_avx, F kernel_avx2)
  {
    const char *architecture_name = "default";
    kernel = kernel_default;

    /* Silence potential warnings about unused variables
     * when compiling without some architectures. */
    (void)kernel_sse2;
    (void)kernel_sse3;
    (void)kernel_sse41;
    (void)kernel_avx;
    (void)kernel_avx2;
    /* NOTE: the `else` before each #endif chains the checks across the
     * preprocessor blocks, so the first (highest) matching architecture wins. */
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
    if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) {
      architecture_name = "AVX2";
      kernel = kernel_avx2;
    }
    else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
        if (DebugFlags().cpu.has_avx() && system_cpu_support_avx()) {
      architecture_name = "AVX";
      kernel = kernel_avx;
    }
    else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
        if (DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) {
      architecture_name = "SSE4.1";
      kernel = kernel_sse41;
    }
    else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
        if (DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) {
      architecture_name = "SSE3";
      kernel = kernel_sse3;
    }
    else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
        if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) {
      architecture_name = "SSE2";
      kernel = kernel_sse2;
    }
#else
    {
      /* Dummy to prevent the architecture if below become
       * conditional when WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
       * is not defined. */
    }
#endif

    /* Log the selection only once per architecture change (shared across
     * template instantiations via `logged_architecture`). */
    if (strcmp(architecture_name, logged_architecture) != 0) {
      VLOG(1) << "Will be using " << architecture_name << " kernels.";
      logged_architecture = architecture_name;
    }
  }

  /* Returns the selected kernel; must have been set by the second constructor. */
  inline F operator()() const
  {
    assert(kernel);
    return kernel;
  }

 protected:
  F kernel;
};
143
/* CPU backend of the split-kernel path tracer: implements the
 * DeviceSplitKernel hooks so the split stages run as plain CPU calls. */
class CPUSplitKernel : public DeviceSplitKernel {
  CPUDevice *device;

 public:
  explicit CPUSplitKernel(CPUDevice *device);

  /* Initializes per-path split state before the stage kernels run. */
  virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
                                              RenderTile &rtile,
                                              int num_global_elements,
                                              device_memory &kernel_globals,
                                              device_memory &kernel_data_,
                                              device_memory &split_data,
                                              device_memory &ray_state,
                                              device_memory &queue_index,
                                              device_memory &use_queues_flag,
                                              device_memory &work_pool_wgs);

  /* Looks up a split kernel stage by its name. */
  virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
                                                         const DeviceRequestedFeatures &);
  virtual int2 split_kernel_local_size();
  virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask *task);
  virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads);
};
167
class CPUDevice : public Device {
 public:
  /* Worker pool used to run render/shader/film-convert/denoise tasks. */
  TaskPool task_pool;
  /* Globals passed to every kernel call. */
  KernelGlobals kernel_globals;

  /* Flat array of image texture descriptors, indexed by texture slot;
   * re-uploaded lazily via load_texture_info() when need_texture_info is set. */
  device_vector<TextureInfo> texture_info;
  bool need_texture_info;

#ifdef WITH_OSL
  OSLGlobals osl_globals;
#endif

  /* True when the debug-flag-enabled split kernel is used. */
  bool use_split_kernel;

  DeviceRequestedFeatures requested_features;

  /* Main render/display kernels, architecture-dispatched via KernelFunctions. */
  KernelFunctions<void (*)(KernelGlobals *, float *, int, int, int, int, int)> path_trace_kernel;
  KernelFunctions<void (*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)>
      convert_to_half_float_kernel;
  KernelFunctions<void (*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)>
      convert_to_byte_kernel;
  KernelFunctions<void (*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)>
      shader_kernel;

  /* Denoising prefilter kernels. */
  KernelFunctions<void (*)(
      int, TileInfo *, int, int, float *, float *, float *, float *, float *, int *, int, int)>
      filter_divide_shadow_kernel;
  KernelFunctions<void (*)(
      int, TileInfo *, int, int, int, int, float *, float *, float, int *, int, int)>
      filter_get_feature_kernel;
  KernelFunctions<void (*)(int, int, int, int *, float *, float *, int, int *)>
      filter_write_feature_kernel;
  KernelFunctions<void (*)(int, int, float *, float *, float *, float *, int *, int)>
      filter_detect_outliers_kernel;
  KernelFunctions<void (*)(int, int, float *, float *, float *, float *, int *, int)>
      filter_combine_halves_kernel;

  /* Non-local-means denoising kernels. */
  KernelFunctions<void (*)(
      int, int, float *, float *, float *, float *, int *, int, int, int, float, float)>
      filter_nlm_calc_difference_kernel;
  KernelFunctions<void (*)(float *, float *, int *, int, int)> filter_nlm_blur_kernel;
  KernelFunctions<void (*)(float *, float *, int *, int, int)> filter_nlm_calc_weight_kernel;
  KernelFunctions<void (*)(
      int, int, float *, float *, float *, float *, float *, int *, int, int, int)>
      filter_nlm_update_output_kernel;
  KernelFunctions<void (*)(float *, float *, int *, int)> filter_nlm_normalize_kernel;

  /* Denoising reconstruction kernels. */
  KernelFunctions<void (*)(
      float *, TileInfo *, int, int, int, float *, int *, int *, int, int, bool, int, float)>
      filter_construct_transform_kernel;
  KernelFunctions<void (*)(int,
                           int,
                           int,
                           float *,
                           float *,
                           float *,
                           int *,
                           float *,
                           float3 *,
                           int *,
                           int *,
                           int,
                           int,
                           int,
                           int,
                           bool)>
      filter_nlm_construct_gramian_kernel;
  KernelFunctions<void (*)(int, int, int, float *, int *, float *, float3 *, int *, int)>
      filter_finalize_kernel;

  /* Split kernel state initialization entry point. */
  KernelFunctions<void (*)(KernelGlobals *,
                           ccl_constant KernelData *,
                           ccl_global void *,
                           int,
                           ccl_global char *,
                           int,
                           int,
                           int,
                           int,
                           int,
                           int,
                           int,
                           int,
                           ccl_global int *,
                           int,
                           ccl_global char *,
                           ccl_global unsigned int *,
                           unsigned int,
                           ccl_global float *)>
      data_init_kernel;
  /* Split kernel stages, looked up by name. */
  unordered_map<string, KernelFunctions<void (*)(KernelGlobals *, KernelData *)>> split_kernels;

/* Expands to the per-architecture symbol list expected by KernelFunctions. */
#define KERNEL_FUNCTIONS(name) \
  KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse2, name), \
      KERNEL_NAME_EVAL(cpu_sse3, name), KERNEL_NAME_EVAL(cpu_sse41, name), \
      KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name)

  /* Binds every kernel member to its per-architecture implementations and
   * initializes device state (thread count, OSL globals, split kernels). */
  CPUDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
      : Device(info_, stats_, profiler_, background_),
        texture_info(this, "__texture_info", MEM_TEXTURE),
#define REGISTER_KERNEL(name) name##_kernel(KERNEL_FUNCTIONS(name))
        REGISTER_KERNEL(path_trace),
        REGISTER_KERNEL(convert_to_half_float),
        REGISTER_KERNEL(convert_to_byte),
        REGISTER_KERNEL(shader),
        REGISTER_KERNEL(filter_divide_shadow),
        REGISTER_KERNEL(filter_get_feature),
        REGISTER_KERNEL(filter_write_feature),
        REGISTER_KERNEL(filter_detect_outliers),
        REGISTER_KERNEL(filter_combine_halves),
        REGISTER_KERNEL(filter_nlm_calc_difference),
        REGISTER_KERNEL(filter_nlm_blur),
        REGISTER_KERNEL(filter_nlm_calc_weight),
        REGISTER_KERNEL(filter_nlm_update_output),
        REGISTER_KERNEL(filter_nlm_normalize),
        REGISTER_KERNEL(filter_construct_transform),
        REGISTER_KERNEL(filter_nlm_construct_gramian),
        REGISTER_KERNEL(filter_finalize),
        REGISTER_KERNEL(data_init)
#undef REGISTER_KERNEL
  {
    /* A thread count of 0 means "use all available cores". */
    if (info.cpu_threads == 0) {
      info.cpu_threads = TaskScheduler::num_threads();
    }

#ifdef WITH_OSL
    kernel_globals.osl = &osl_globals;
#endif
    use_split_kernel = DebugFlags().cpu.split_kernel;
    if (use_split_kernel) {
      VLOG(1) << "Will be using split kernel.";
    }
    need_texture_info = false;

/* Register each split kernel stage under its name for lookup at run time. */
#define REGISTER_SPLIT_KERNEL(name) \
  split_kernels[#name] = KernelFunctions<void (*)(KernelGlobals *, KernelData *)>( \
      KERNEL_FUNCTIONS(name))
    REGISTER_SPLIT_KERNEL(path_init);
    REGISTER_SPLIT_KERNEL(scene_intersect);
    REGISTER_SPLIT_KERNEL(lamp_emission);
    REGISTER_SPLIT_KERNEL(do_volume);
    REGISTER_SPLIT_KERNEL(queue_enqueue);
    REGISTER_SPLIT_KERNEL(indirect_background);
    REGISTER_SPLIT_KERNEL(shader_setup);
    REGISTER_SPLIT_KERNEL(shader_sort);
    REGISTER_SPLIT_KERNEL(shader_eval);
    REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao);
    REGISTER_SPLIT_KERNEL(subsurface_scatter);
    REGISTER_SPLIT_KERNEL(direct_lighting);
    REGISTER_SPLIT_KERNEL(shadow_blocked_ao);
    REGISTER_SPLIT_KERNEL(shadow_blocked_dl);
    REGISTER_SPLIT_KERNEL(enqueue_inactive);
    REGISTER_SPLIT_KERNEL(next_iteration_setup);
    REGISTER_SPLIT_KERNEL(indirect_subsurface);
    REGISTER_SPLIT_KERNEL(buffer_update);
    REGISTER_SPLIT_KERNEL(adaptive_stopping);
    REGISTER_SPLIT_KERNEL(adaptive_filter_x);
    REGISTER_SPLIT_KERNEL(adaptive_filter_y);
    REGISTER_SPLIT_KERNEL(adaptive_adjust_samples);
#undef REGISTER_SPLIT_KERNEL
#undef KERNEL_FUNCTIONS
  }
330
  ~CPUDevice()
  {
    /* Stop the worker pool before releasing device-owned memory. */
    task_pool.stop();
    texture_info.free();
  }
336
337   virtual bool show_samples() const
338   {
339     return (info.cpu_threads == 1);
340   }
341
  /* Reports which BVH layouts the CPU kernels can traverse: BVH2 always,
   * BVH4 with SSE2, BVH8 with AVX2 (except MSVC native builds), and Embree
   * when compiled in. */
  virtual BVHLayoutMask get_bvh_layout_mask() const
  {
    BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2;
    if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) {
      bvh_layout_mask |= BVH_LAYOUT_BVH4;
    }
    /* MSVC does not support the -march=native switch and you always end up  */
    /* with an sse2 kernel when you use WITH_KERNEL_NATIVE. We *cannot* feed */
    /* that kernel BVH8 even if the CPU flags would allow for it. */
#if (defined(__x86_64__) || defined(_M_X64)) && !(defined(_MSC_VER) && defined(WITH_KERNEL_NATIVE))
    if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) {
      bvh_layout_mask |= BVH_LAYOUT_BVH8;
    }
#endif
#ifdef WITH_EMBREE
    bvh_layout_mask |= BVH_LAYOUT_EMBREE;
#endif /* WITH_EMBREE */
    return bvh_layout_mask;
  }
361
362   void load_texture_info()
363   {
364     if (need_texture_info) {
365       texture_info.copy_to_device();
366       need_texture_info = false;
367     }
368   }
369
  /* Allocates "device" memory. On CPU the device pointer normally aliases the
   * host pointer; only MEM_DEVICE_ONLY buffers get a dedicated allocation. */
  void mem_alloc(device_memory &mem)
  {
    if (mem.type == MEM_TEXTURE) {
      assert(!"mem_alloc not supported for textures.");
    }
    else {
      if (mem.name) {
        VLOG(1) << "Buffer allocate: " << mem.name << ", "
                << string_human_readable_number(mem.memory_size()) << " bytes. ("
                << string_human_readable_size(mem.memory_size()) << ")";
      }

      if (mem.type == MEM_DEVICE_ONLY) {
        /* No host-side copy exists, so own an aligned allocation. */
        assert(!mem.host_pointer);
        size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES;
        void *data = util_aligned_malloc(mem.memory_size(), alignment);
        mem.device_pointer = (device_ptr)data;
      }
      else {
        /* Share the host allocation directly. */
        mem.device_pointer = (device_ptr)mem.host_pointer;
      }

      mem.device_size = mem.memory_size();
      stats.mem_alloc(mem.device_size);
    }
  }
396
  /* "Copies" memory to the device. Textures are re-registered so kernel
   * globals see the new data, pixels are unsupported, and everything else is
   * a no-op since host and device memory are shared on CPU. */
  void mem_copy_to(device_memory &mem)
  {
    if (mem.type == MEM_TEXTURE) {
      tex_free(mem);
      tex_alloc(mem);
    }
    else if (mem.type == MEM_PIXELS) {
      assert(!"mem_copy_to not supported for pixels.");
    }
    else {
      if (!mem.device_pointer) {
        mem_alloc(mem);
      }

      /* copy is no-op */
    }
  }
414
  /* Host and device memory are the same on CPU, so there is nothing to copy
   * back; the parameters are part of the generic Device interface. */
  void mem_copy_from(device_memory & /*mem*/, int /*y*/, int /*w*/, int /*h*/, int /*elem*/)
  {
    /* no-op */
  }
419
420   void mem_zero(device_memory &mem)
421   {
422     if (!mem.device_pointer) {
423       mem_alloc(mem);
424     }
425
426     if (mem.device_pointer) {
427       memset((void *)mem.device_pointer, 0, mem.memory_size());
428     }
429   }
430
  /* Frees device memory; only MEM_DEVICE_ONLY buffers own their allocation,
   * other types merely alias the host pointer and just clear bookkeeping. */
  void mem_free(device_memory &mem)
  {
    if (mem.type == MEM_TEXTURE) {
      tex_free(mem);
    }
    else if (mem.device_pointer) {
      if (mem.type == MEM_DEVICE_ONLY) {
        util_aligned_free((void *)mem.device_pointer);
      }
      mem.device_pointer = 0;
      stats.mem_free(mem.device_size);
      mem.device_size = 0;
    }
  }
445
446   virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
447   {
448     return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
449   }
450
  /* Copies named constant data into the kernel globals for kernel access. */
  void const_copy_to(const char *name, void *host, size_t size)
  {
    kernel_const_copy(&kernel_globals, name, host, size);
  }
455
  /* Registers a texture with the kernel globals. Data textures are copied by
   * name; image textures get a TextureInfo slot parsed from the numeric
   * suffix of their "__tex_image..." name. */
  void tex_alloc(device_memory &mem)
  {
    VLOG(1) << "Texture allocate: " << mem.name << ", "
            << string_human_readable_number(mem.memory_size()) << " bytes. ("
            << string_human_readable_size(mem.memory_size()) << ")";

    if (mem.interpolation == INTERPOLATION_NONE) {
      /* Data texture. */
      kernel_tex_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size);
    }
    else {
      /* Image Texture. */
      int flat_slot = 0;
      if (string_startswith(mem.name, "__tex_image")) {
        /* Slot index is the number after the last underscore in the name. */
        int pos = string(mem.name).rfind("_");
        flat_slot = atoi(mem.name + pos + 1);
      }
      else {
        assert(0);
      }

      if (flat_slot >= texture_info.size()) {
        /* Allocate some slots in advance, to reduce amount
         * of re-allocations. */
        texture_info.resize(flat_slot + 128);
      }

      TextureInfo &info = texture_info[flat_slot];
      info.data = (uint64_t)mem.host_pointer;
      info.cl_buffer = 0;
      info.interpolation = mem.interpolation;
      info.extension = mem.extension;
      info.width = mem.data_width;
      info.height = mem.data_height;
      info.depth = mem.data_depth;

      /* Descriptor array changed; re-upload before the next kernel use. */
      need_texture_info = true;
    }

    mem.device_pointer = (device_ptr)mem.host_pointer;
    mem.device_size = mem.memory_size();
    stats.mem_alloc(mem.device_size);
  }
499
500   void tex_free(device_memory &mem)
501   {
502     if (mem.device_pointer) {
503       mem.device_pointer = 0;
504       stats.mem_free(mem.device_size);
505       mem.device_size = 0;
506       need_texture_info = true;
507     }
508   }
509
  /* Returns the OSL globals when built with OSL support, NULL otherwise. */
  void *osl_memory()
  {
#ifdef WITH_OSL
    return &osl_globals;
#else
    return NULL;
#endif
  }
518
519   void thread_run(DeviceTask *task)
520   {
521     if (task->type == DeviceTask::RENDER)
522       thread_render(*task);
523     else if (task->type == DeviceTask::SHADER)
524       thread_shader(*task);
525     else if (task->type == DeviceTask::FILM_CONVERT)
526       thread_film_convert(*task);
527     else if (task->type == DeviceTask::DENOISE_BUFFER)
528       thread_denoise(*task);
529   }
530
  /* DeviceTask whose run callback is bound to this device's thread_run, so
   * the task pool executes it on a worker thread. */
  class CPUDeviceTask : public DeviceTask {
   public:
    CPUDeviceTask(CPUDevice *device, DeviceTask &task) : DeviceTask(task)
    {
      run = function_bind(&CPUDevice::thread_run, device, this);
    }
  };
538
  /* Runs a full non-local-means filter pass of `image` (guided by `guide` and
   * its `variance`) into `out`, iterating over all (2r+1)^2 pixel offsets and
   * accumulating weighted contributions, then normalizing. */
  bool denoising_non_local_means(device_ptr image_ptr,
                                 device_ptr guide_ptr,
                                 device_ptr variance_ptr,
                                 device_ptr out_ptr,
                                 DenoisingTask *task)
  {
    ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_NON_LOCAL_MEANS);

    int4 rect = task->rect;
    int r = task->nlm_state.r;
    int f = task->nlm_state.f;
    float a = task->nlm_state.a;
    float k_2 = task->nlm_state.k_2;

    int w = align_up(rect.z - rect.x, 4);
    int h = rect.w - rect.y;
    int stride = task->buffer.stride;
    int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;

    /* The temporary buffer is partitioned into three w*h planes:
     * blurred difference, raw difference, and the weight accumulator. */
    float *temporary_mem = (float *)task->buffer.temporary_mem.device_pointer;
    float *blurDifference = temporary_mem;
    float *difference = temporary_mem + task->buffer.pass_stride;
    float *weightAccum = temporary_mem + 2 * task->buffer.pass_stride;

    memset(weightAccum, 0, sizeof(float) * w * h);
    memset((float *)out_ptr, 0, sizeof(float) * w * h);

    /* Visit every offset (dx, dy) in the [-r, r] x [-r, r] window. */
    for (int i = 0; i < (2 * r + 1) * (2 * r + 1); i++) {
      int dy = i / (2 * r + 1) - r;
      int dx = i % (2 * r + 1) - r;

      /* Clamp the processed region so (x+dx, y+dy) stays inside the rect. */
      int local_rect[4] = {
          max(0, -dx), max(0, -dy), rect.z - rect.x - max(0, dx), rect.w - rect.y - max(0, dy)};
      filter_nlm_calc_difference_kernel()(dx,
                                          dy,
                                          (float *)guide_ptr,
                                          (float *)variance_ptr,
                                          NULL,
                                          difference,
                                          local_rect,
                                          w,
                                          channel_offset,
                                          0,
                                          a,
                                          k_2);

      /* Blur -> weight -> blur sequence turns per-pixel differences into
       * smoothed filter weights. */
      filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f);
      filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f);
      filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f);

      filter_nlm_update_output_kernel()(dx,
                                        dy,
                                        blurDifference,
                                        (float *)image_ptr,
                                        difference,
                                        (float *)out_ptr,
                                        weightAccum,
                                        local_rect,
                                        channel_offset,
                                        stride,
                                        f);
    }

    /* Divide the accumulated output by the accumulated weights. */
    int local_rect[4] = {0, 0, rect.z - rect.x, rect.w - rect.y};
    filter_nlm_normalize_kernel()((float *)out_ptr, weightAccum, local_rect, w);

    return true;
  }
607
  /* Builds, for every pixel of the filter area, the feature-space transform
   * (and its rank) used by the denoising reconstruction. */
  bool denoising_construct_transform(DenoisingTask *task)
  {
    ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_CONSTRUCT_TRANSFORM);

    for (int y = 0; y < task->filter_area.w; y++) {
      for (int x = 0; x < task->filter_area.z; x++) {
        filter_construct_transform_kernel()((float *)task->buffer.mem.device_pointer,
                                            task->tile_info,
                                            x + task->filter_area.x,
                                            y + task->filter_area.y,
                                            y * task->filter_area.z + x,
                                            (float *)task->storage.transform.device_pointer,
                                            (int *)task->storage.rank.device_pointer,
                                            &task->rect.x,
                                            task->buffer.pass_stride,
                                            task->buffer.frame_stride,
                                            task->buffer.use_time,
                                            task->radius,
                                            task->pca_threshold);
      }
    }
    return true;
  }
631
  /* Accumulates the weighted least-squares system (XtWX, XtWY) for one frame
   * of the denoising reconstruction, iterating over all (2r+1)^2 offsets and
   * computing NLM weights for each before adding its contribution. */
  bool denoising_accumulate(device_ptr color_ptr,
                            device_ptr color_variance_ptr,
                            device_ptr scale_ptr,
                            int frame,
                            DenoisingTask *task)
  {
    ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT);

    /* Temporary buffer split into two pass_stride-sized planes. */
    float *temporary_mem = (float *)task->buffer.temporary_mem.device_pointer;
    float *difference = temporary_mem;
    float *blurDifference = temporary_mem + task->buffer.pass_stride;

    int r = task->radius;
    int frame_offset = frame * task->buffer.frame_stride;
    for (int i = 0; i < (2 * r + 1) * (2 * r + 1); i++) {
      int dy = i / (2 * r + 1) - r;
      int dx = i % (2 * r + 1) - r;

      /* Clamp the region so the shifted lookup stays inside the source. */
      int local_rect[4] = {max(0, -dx),
                           max(0, -dy),
                           task->reconstruction_state.source_w - max(0, dx),
                           task->reconstruction_state.source_h - max(0, dy)};
      filter_nlm_calc_difference_kernel()(dx,
                                          dy,
                                          (float *)color_ptr,
                                          (float *)color_variance_ptr,
                                          (float *)scale_ptr,
                                          difference,
                                          local_rect,
                                          task->buffer.stride,
                                          task->buffer.pass_stride,
                                          frame_offset,
                                          1.0f,
                                          task->nlm_k_2);
      /* Blur -> weight -> blur turns raw differences into filter weights. */
      filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
      filter_nlm_calc_weight_kernel()(
          blurDifference, difference, local_rect, task->buffer.stride, 4);
      filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
      filter_nlm_construct_gramian_kernel()(dx,
                                            dy,
                                            task->tile_info->frames[frame],
                                            blurDifference,
                                            (float *)task->buffer.mem.device_pointer,
                                            (float *)task->storage.transform.device_pointer,
                                            (int *)task->storage.rank.device_pointer,
                                            (float *)task->storage.XtWX.device_pointer,
                                            (float3 *)task->storage.XtWY.device_pointer,
                                            local_rect,
                                            &task->reconstruction_state.filter_window.x,
                                            task->buffer.stride,
                                            4,
                                            task->buffer.pass_stride,
                                            frame_offset,
                                            task->buffer.use_time);
    }

    return true;
  }
690
  /* Solves the accumulated per-pixel least-squares system (rank, XtWX, XtWY)
   * and writes the denoised result for every pixel of the filter area. */
  bool denoising_solve(device_ptr output_ptr, DenoisingTask *task)
  {
    for (int y = 0; y < task->filter_area.w; y++) {
      for (int x = 0; x < task->filter_area.z; x++) {
        filter_finalize_kernel()(x,
                                 y,
                                 y * task->filter_area.z + x,
                                 (float *)output_ptr,
                                 (int *)task->storage.rank.device_pointer,
                                 (float *)task->storage.XtWX.device_pointer,
                                 (float3 *)task->storage.XtWY.device_pointer,
                                 &task->reconstruction_state.buffer_params.x,
                                 task->render_buffer.samples);
      }
    }
    return true;
  }
708
  /* Combines the two half-buffers `a` and `b` into a mean and a variance
   * estimate for every pixel in `rect`. */
  bool denoising_combine_halves(device_ptr a_ptr,
                                device_ptr b_ptr,
                                device_ptr mean_ptr,
                                device_ptr variance_ptr,
                                int r,
                                int4 rect,
                                DenoisingTask *task)
  {
    ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_COMBINE_HALVES);

    for (int y = rect.y; y < rect.w; y++) {
      for (int x = rect.x; x < rect.z; x++) {
        filter_combine_halves_kernel()(x,
                                       y,
                                       (float *)mean_ptr,
                                       (float *)variance_ptr,
                                       (float *)a_ptr,
                                       (float *)b_ptr,
                                       &rect.x,
                                       r);
      }
    }
    return true;
  }
733
  /* Computes the unfiltered shadow passes and their variances from the two
   * shadow half-buffers for every pixel of the task rect. */
  bool denoising_divide_shadow(device_ptr a_ptr,
                               device_ptr b_ptr,
                               device_ptr sample_variance_ptr,
                               device_ptr sv_variance_ptr,
                               device_ptr buffer_variance_ptr,
                               DenoisingTask *task)
  {
    ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DIVIDE_SHADOW);

    for (int y = task->rect.y; y < task->rect.w; y++) {
      for (int x = task->rect.x; x < task->rect.z; x++) {
        filter_divide_shadow_kernel()(task->render_buffer.samples,
                                      task->tile_info,
                                      x,
                                      y,
                                      (float *)a_ptr,
                                      (float *)b_ptr,
                                      (float *)sample_variance_ptr,
                                      (float *)sv_variance_ptr,
                                      (float *)buffer_variance_ptr,
                                      &task->rect.x,
                                      task->render_buffer.pass_stride,
                                      task->render_buffer.offset);
      }
    }
    return true;
  }
761
  /* Extracts a feature pass (given by mean/variance pass offsets) from the
   * render buffer into separate mean and variance images, applying `scale`. */
  bool denoising_get_feature(int mean_offset,
                             int variance_offset,
                             device_ptr mean_ptr,
                             device_ptr variance_ptr,
                             float scale,
                             DenoisingTask *task)
  {
    ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_GET_FEATURE);

    for (int y = task->rect.y; y < task->rect.w; y++) {
      for (int x = task->rect.x; x < task->rect.z; x++) {
        filter_get_feature_kernel()(task->render_buffer.samples,
                                    task->tile_info,
                                    mean_offset,
                                    variance_offset,
                                    x,
                                    y,
                                    (float *)mean_ptr,
                                    (float *)variance_ptr,
                                    scale,
                                    &task->rect.x,
                                    task->render_buffer.pass_stride,
                                    task->render_buffer.offset);
      }
    }
    return true;
  }
789
  /* Writes a processed feature image back into the render buffer at the
   * given pass offset, for every pixel of the filter area. */
  bool denoising_write_feature(int out_offset,
                               device_ptr from_ptr,
                               device_ptr buffer_ptr,
                               DenoisingTask *task)
  {
    for (int y = 0; y < task->filter_area.w; y++) {
      for (int x = 0; x < task->filter_area.z; x++) {
        filter_write_feature_kernel()(task->render_buffer.samples,
                                      x + task->filter_area.x,
                                      y + task->filter_area.y,
                                      &task->reconstruction_state.buffer_params.x,
                                      (float *)from_ptr,
                                      (float *)buffer_ptr,
                                      out_offset,
                                      &task->rect.x);
      }
    }
    return true;
  }
809
810   bool denoising_detect_outliers(device_ptr image_ptr,
811                                  device_ptr variance_ptr,
812                                  device_ptr depth_ptr,
813                                  device_ptr output_ptr,
814                                  DenoisingTask *task)
815   {
816     ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DETECT_OUTLIERS);
817
818     for (int y = task->rect.y; y < task->rect.w; y++) {
819       for (int x = task->rect.x; x < task->rect.z; x++) {
820         filter_detect_outliers_kernel()(x,
821                                         y,
822                                         (float *)image_ptr,
823                                         (float *)variance_ptr,
824                                         (float *)depth_ptr,
825                                         (float *)output_ptr,
826                                         &task->rect.x,
827                                         task->buffer.pass_stride);
828       }
829     }
830     return true;
831   }
832
833   bool adaptive_sampling_filter(KernelGlobals *kg, RenderTile &tile)
834   {
835     WorkTile wtile;
836     wtile.x = tile.x;
837     wtile.y = tile.y;
838     wtile.w = tile.w;
839     wtile.h = tile.h;
840     wtile.offset = tile.offset;
841     wtile.stride = tile.stride;
842     wtile.buffer = (float *)tile.buffer;
843
844     bool any = false;
845     for (int y = tile.y; y < tile.y + tile.h; ++y) {
846       any |= kernel_do_adaptive_filter_x(kg, y, &wtile);
847     }
848     for (int x = tile.x; x < tile.x + tile.w; ++x) {
849       any |= kernel_do_adaptive_filter_y(kg, x, &wtile);
850     }
851     return (!any);
852   }
853
  /* After adaptive sampling, pixels in a tile have received different numbers
   * of samples; rescale each pixel's passes so the whole tile reads as if it
   * had tile.sample samples.
   * NOTE(review): the sign of pass_sample_count appears to encode whether the
   * pixel was stopped early by the adaptive filter (negative = stopped) --
   * confirm against kernel_adaptive_sampling.h. */
  void adaptive_sampling_post(const RenderTile &tile, KernelGlobals *kg)
  {
    float *render_buffer = (float *)tile.buffer;
    for (int y = tile.y; y < tile.y + tile.h; y++) {
      for (int x = tile.x; x < tile.x + tile.w; x++) {
        int index = tile.offset + x + y * tile.stride;
        ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
        if (buffer[kernel_data.film.pass_sample_count] < 0.0f) {
          /* Stopped-early pixel: flip the negated count back to positive, then
           * scale passes up to the tile's nominal sample count. The max() keeps
           * the divisor at least start_sample + 1. */
          buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
          float sample_multiplier = tile.sample / max((float)tile.start_sample + 1.0f,
                                                      buffer[kernel_data.film.pass_sample_count]);
          if (sample_multiplier != 1.0f) {
            kernel_adaptive_post_adjust(kg, buffer, sample_multiplier);
          }
        }
        else {
          /* Fully sampled pixel: scale by sample / (sample - 1).
           * NOTE(review): this divides by zero when tile.sample == 1 -- looks
           * like callers guarantee more than one sample; verify. */
          kernel_adaptive_post_adjust(kg, buffer, tile.sample / (tile.sample - 1.0f));
        }
      }
    }
  }
875
  /* Render all samples of one tile, updating progress once per sample.
   * This is the code path fixed for T74504: when adaptive sampling makes the
   * tile converge early, the remaining (skipped) samples must still be
   * credited to the progress bar. */
  void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
  {
    const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE;

    scoped_timer timer(&tile.buffers->render_time);

    Coverage coverage(kg, tile);
    if (use_coverage) {
      coverage.init_path_trace();
    }

    float *render_buffer = (float *)tile.buffer;
    int start_sample = tile.start_sample;
    int end_sample = tile.start_sample + tile.num_samples;

    /* Needed for Embree. */
    SIMD_SET_FLUSH_TO_ZERO;

    for (int sample = start_sample; sample < end_sample; sample++) {
      /* On cancel, keep rendering only if the queue must be finished. */
      if (task.get_cancel() || task_pool.canceled()) {
        if (task.need_finish_queue == false)
          break;
      }

      for (int y = tile.y; y < tile.y + tile.h; y++) {
        for (int x = tile.x; x < tile.x + tile.w; x++) {
          if (use_coverage) {
            coverage.init_pixel(x, y);
          }
          path_trace_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
        }
      }
      tile.sample = sample + 1;

      if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) {
        const bool stop = adaptive_sampling_filter(kg, tile);
        if (stop) {
          /* Tile converged early: credit all remaining samples to the progress
           * bar at once so the total still sums to num_samples (T74504). */
          const int num_progress_samples = end_sample - sample;
          tile.sample = end_sample;
          task.update_progress(&tile, tile.w * tile.h * num_progress_samples);
          break;
        }
      }

      task.update_progress(&tile, tile.w * tile.h);
    }
    if (use_coverage) {
      coverage.finalize();
    }

    /* Normalize per-pixel sample counts after adaptive sampling. */
    if (task.adaptive_sampling.use) {
      adaptive_sampling_post(tile, kg);
    }
  }
930
  /* Bind the CPU denoising kernel wrappers into the DenoisingTask and run
   * denoising on the given tile.
   * NOTE(review): the _1.._6 placeholder count of each bind must match the
   * parameter list of the corresponding denoising_* method above -- keep them
   * in sync when changing any signature. */
  void denoise(DenoisingTask &denoising, RenderTile &tile)
  {
    ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING);

    /* Denoising runs once all samples of the tile are done. */
    tile.sample = tile.start_sample + tile.num_samples;

    denoising.functions.construct_transform = function_bind(
        &CPUDevice::denoising_construct_transform, this, &denoising);
    denoising.functions.accumulate = function_bind(
        &CPUDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
    denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising);
    denoising.functions.divide_shadow = function_bind(
        &CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
    denoising.functions.non_local_means = function_bind(
        &CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
    denoising.functions.combine_halves = function_bind(
        &CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
    denoising.functions.get_feature = function_bind(
        &CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
    denoising.functions.write_feature = function_bind(
        &CPUDevice::denoising_write_feature, this, _1, _2, _3, &denoising);
    denoising.functions.detect_outliers = function_bind(
        &CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);

    denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
    denoising.render_buffer.samples = tile.sample;
    /* CPU has no GPU temporary memory to manage. */
    denoising.buffer.gpu_temporary_mem = false;

    denoising.run_denoising(&tile);
  }
961
962   void thread_render(DeviceTask &task)
963   {
964     if (task_pool.canceled()) {
965       if (task.need_finish_queue == false)
966         return;
967     }
968
969     /* allocate buffer for kernel globals */
970     device_only_memory<KernelGlobals> kgbuffer(this, "kernel_globals");
971     kgbuffer.alloc_to_device(1);
972
973     KernelGlobals *kg = new ((void *)kgbuffer.device_pointer)
974         KernelGlobals(thread_kernel_globals_init());
975
976     profiler.add_state(&kg->profiler);
977
978     CPUSplitKernel *split_kernel = NULL;
979     if (use_split_kernel) {
980       split_kernel = new CPUSplitKernel(this);
981       if (!split_kernel->load_kernels(requested_features)) {
982         thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer);
983         kgbuffer.free();
984         delete split_kernel;
985         return;
986       }
987     }
988
989     RenderTile tile;
990     DenoisingTask denoising(this, task);
991     denoising.profiler = &kg->profiler;
992
993     while (task.acquire_tile(this, tile, task.tile_types)) {
994       if (tile.task == RenderTile::PATH_TRACE) {
995         if (use_split_kernel) {
996           device_only_memory<uchar> void_buffer(this, "void_buffer");
997           split_kernel->path_trace(&task, tile, kgbuffer, void_buffer);
998         }
999         else {
1000           path_trace(task, tile, kg);
1001         }
1002       }
1003       else if (tile.task == RenderTile::DENOISE) {
1004         denoise(denoising, tile);
1005         task.update_progress(&tile, tile.w * tile.h);
1006       }
1007
1008       task.release_tile(tile);
1009
1010       if (task_pool.canceled()) {
1011         if (task.need_finish_queue == false)
1012           break;
1013       }
1014     }
1015
1016     profiler.remove_state(&kg->profiler);
1017
1018     thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer);
1019     kg->~KernelGlobals();
1020     kgbuffer.free();
1021     delete split_kernel;
1022   }
1023
1024   void thread_denoise(DeviceTask &task)
1025   {
1026     RenderTile tile;
1027     tile.x = task.x;
1028     tile.y = task.y;
1029     tile.w = task.w;
1030     tile.h = task.h;
1031     tile.buffer = task.buffer;
1032     tile.sample = task.sample + task.num_samples;
1033     tile.num_samples = task.num_samples;
1034     tile.start_sample = task.sample;
1035     tile.offset = task.offset;
1036     tile.stride = task.stride;
1037     tile.buffers = task.buffers;
1038
1039     DenoisingTask denoising(this, task);
1040
1041     ProfilingState denoising_profiler_state;
1042     profiler.add_state(&denoising_profiler_state);
1043     denoising.profiler = &denoising_profiler_state;
1044
1045     denoise(denoising, tile);
1046     task.update_progress(&tile, tile.w * tile.h);
1047
1048     profiler.remove_state(&denoising_profiler_state);
1049   }
1050
1051   void thread_film_convert(DeviceTask &task)
1052   {
1053     float sample_scale = 1.0f / (task.sample + 1);
1054
1055     if (task.rgba_half) {
1056       for (int y = task.y; y < task.y + task.h; y++)
1057         for (int x = task.x; x < task.x + task.w; x++)
1058           convert_to_half_float_kernel()(&kernel_globals,
1059                                          (uchar4 *)task.rgba_half,
1060                                          (float *)task.buffer,
1061                                          sample_scale,
1062                                          x,
1063                                          y,
1064                                          task.offset,
1065                                          task.stride);
1066     }
1067     else {
1068       for (int y = task.y; y < task.y + task.h; y++)
1069         for (int x = task.x; x < task.x + task.w; x++)
1070           convert_to_byte_kernel()(&kernel_globals,
1071                                    (uchar4 *)task.rgba_byte,
1072                                    (float *)task.buffer,
1073                                    sample_scale,
1074                                    x,
1075                                    y,
1076                                    task.offset,
1077                                    task.stride);
1078     }
1079   }
1080
1081   void thread_shader(DeviceTask &task)
1082   {
1083     KernelGlobals *kg = new KernelGlobals(thread_kernel_globals_init());
1084
1085     for (int sample = 0; sample < task.num_samples; sample++) {
1086       for (int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
1087         shader_kernel()(kg,
1088                         (uint4 *)task.shader_input,
1089                         (float4 *)task.shader_output,
1090                         task.shader_eval_type,
1091                         task.shader_filter,
1092                         x,
1093                         task.offset,
1094                         sample);
1095
1096       if (task.get_cancel() || task_pool.canceled())
1097         break;
1098
1099       task.update_progress(NULL);
1100     }
1101
1102     thread_kernel_globals_free(kg);
1103     delete kg;
1104   }
1105
1106   int get_split_task_count(DeviceTask &task)
1107   {
1108     if (task.type == DeviceTask::SHADER)
1109       return task.get_subtask_count(info.cpu_threads, 256);
1110     else
1111       return task.get_subtask_count(info.cpu_threads);
1112   }
1113
1114   void task_add(DeviceTask &task)
1115   {
1116     /* Load texture info. */
1117     load_texture_info();
1118
1119     /* split task into smaller ones */
1120     list<DeviceTask> tasks;
1121
1122     if (task.type == DeviceTask::SHADER)
1123       task.split(tasks, info.cpu_threads, 256);
1124     else
1125       task.split(tasks, info.cpu_threads);
1126
1127     foreach (DeviceTask &task, tasks)
1128       task_pool.push(new CPUDeviceTask(this, task));
1129   }
1130
1131   void task_wait()
1132   {
1133     task_pool.wait_work();
1134   }
1135
1136   void task_cancel()
1137   {
1138     task_pool.cancel();
1139   }
1140
1141  protected:
1142   inline KernelGlobals thread_kernel_globals_init()
1143   {
1144     KernelGlobals kg = kernel_globals;
1145     kg.transparent_shadow_intersections = NULL;
1146     const int decoupled_count = sizeof(kg.decoupled_volume_steps) /
1147                                 sizeof(*kg.decoupled_volume_steps);
1148     for (int i = 0; i < decoupled_count; ++i) {
1149       kg.decoupled_volume_steps[i] = NULL;
1150     }
1151     kg.decoupled_volume_steps_index = 0;
1152     kg.coverage_asset = kg.coverage_object = kg.coverage_material = NULL;
1153 #ifdef WITH_OSL
1154     OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
1155 #endif
1156     return kg;
1157   }
1158
1159   inline void thread_kernel_globals_free(KernelGlobals *kg)
1160   {
1161     if (kg == NULL) {
1162       return;
1163     }
1164
1165     if (kg->transparent_shadow_intersections != NULL) {
1166       free(kg->transparent_shadow_intersections);
1167     }
1168     const int decoupled_count = sizeof(kg->decoupled_volume_steps) /
1169                                 sizeof(*kg->decoupled_volume_steps);
1170     for (int i = 0; i < decoupled_count; ++i) {
1171       if (kg->decoupled_volume_steps[i] != NULL) {
1172         free(kg->decoupled_volume_steps[i]);
1173       }
1174     }
1175 #ifdef WITH_OSL
1176     OSLShader::thread_free(kg);
1177 #endif
1178   }
1179
1180   virtual bool load_kernels(const DeviceRequestedFeatures &requested_features_)
1181   {
1182     requested_features = requested_features_;
1183
1184     return true;
1185   }
1186 };
1187
1188 /* split kernel */
1189
1190 class CPUSplitKernelFunction : public SplitKernelFunction {
1191  public:
1192   CPUDevice *device;
1193   void (*func)(KernelGlobals *kg, KernelData *data);
1194
1195   CPUSplitKernelFunction(CPUDevice *device) : device(device), func(NULL)
1196   {
1197   }
1198   ~CPUSplitKernelFunction()
1199   {
1200   }
1201
1202   virtual bool enqueue(const KernelDimensions &dim,
1203                        device_memory &kernel_globals,
1204                        device_memory &data)
1205   {
1206     if (!func) {
1207       return false;
1208     }
1209
1210     KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer;
1211     kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
1212
1213     for (int y = 0; y < dim.global_size[1]; y++) {
1214       for (int x = 0; x < dim.global_size[0]; x++) {
1215         kg->global_id = make_int2(x, y);
1216
1217         func(kg, (KernelData *)data.device_pointer);
1218       }
1219     }
1220
1221     return true;
1222   }
1223 };
1224
1225 CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device)
1226 {
1227 }
1228
/* Run the data_init split kernel for every global work item, initializing the
 * split-kernel state buffers for the tile.
 * NOTE(review): the argument list is positional and must match the data_init
 * kernel signature exactly -- do not reorder. */
bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim,
                                                    RenderTile &rtile,
                                                    int num_global_elements,
                                                    device_memory &kernel_globals,
                                                    device_memory &data,
                                                    device_memory &split_data,
                                                    device_memory &ray_state,
                                                    device_memory &queue_index,
                                                    device_memory &use_queues_flags,
                                                    device_memory &work_pool_wgs)
{
  KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer;
  kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);

  /* Emulate a GPU dispatch: one kernel invocation per global work item. */
  for (int y = 0; y < dim.global_size[1]; y++) {
    for (int x = 0; x < dim.global_size[0]; x++) {
      kg->global_id = make_int2(x, y);

      device->data_init_kernel()((KernelGlobals *)kernel_globals.device_pointer,
                                 (KernelData *)data.device_pointer,
                                 (void *)split_data.device_pointer,
                                 num_global_elements,
                                 (char *)ray_state.device_pointer,
                                 rtile.start_sample,
                                 rtile.start_sample + rtile.num_samples,
                                 rtile.x,
                                 rtile.y,
                                 rtile.w,
                                 rtile.h,
                                 rtile.offset,
                                 rtile.stride,
                                 (int *)queue_index.device_pointer,
                                 dim.global_size[0] * dim.global_size[1],
                                 (char *)use_queues_flags.device_pointer,
                                 (uint *)work_pool_wgs.device_pointer,
                                 rtile.num_samples,
                                 (float *)rtile.buffer);
    }
  }

  return true;
}
1271
1272 SplitKernelFunction *CPUSplitKernel::get_split_kernel_function(const string &kernel_name,
1273                                                                const DeviceRequestedFeatures &)
1274 {
1275   CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);
1276
1277   kernel->func = device->split_kernels[kernel_name]();
1278   if (!kernel->func) {
1279     delete kernel;
1280     return NULL;
1281   }
1282
1283   return kernel;
1284 }
1285
1286 int2 CPUSplitKernel::split_kernel_local_size()
1287 {
1288   return make_int2(1, 1);
1289 }
1290
1291 int2 CPUSplitKernel::split_kernel_global_size(device_memory & /*kg*/,
1292                                               device_memory & /*data*/,
1293                                               DeviceTask * /*task*/)
1294 {
1295   return make_int2(1, 1);
1296 }
1297
1298 uint64_t CPUSplitKernel::state_buffer_size(device_memory &kernel_globals,
1299                                            device_memory & /*data*/,
1300                                            size_t num_threads)
1301 {
1302   KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer;
1303
1304   return split_data_buffer_size(kg, num_threads);
1305 }
1306
1307 Device *device_cpu_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
1308 {
1309   return new CPUDevice(info, stats, profiler, background);
1310 }
1311
1312 void device_cpu_info(vector<DeviceInfo> &devices)
1313 {
1314   DeviceInfo info;
1315
1316   info.type = DEVICE_CPU;
1317   info.description = system_cpu_brand_string();
1318   info.id = "CPU";
1319   info.num = 0;
1320   info.has_volume_decoupled = true;
1321   info.has_osl = true;
1322   info.has_half_images = true;
1323   info.has_profiling = true;
1324
1325   devices.insert(devices.begin(), info);
1326 }
1327
1328 string device_cpu_capabilities()
1329 {
1330   string capabilities = "";
1331   capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
1332   capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
1333   capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
1334   capabilities += system_cpu_support_avx() ? "AVX " : "";
1335   capabilities += system_cpu_support_avx2() ? "AVX2" : "";
1336   if (capabilities[capabilities.size() - 1] == ' ')
1337     capabilities.resize(capabilities.size() - 1);
1338   return capabilities;
1339 }
1340
1341 CCL_NAMESPACE_END