2 * Copyright 2011-2013 Blender Foundation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
20 /* So ImathMath is included before our kernel_cpu_compat. */
22 /* So no context pollution happens from indirectly included windows.h */
23 # include "util/util_windows.h"
24 # include <OSL/oslexec.h>
27 #include "device/device.h"
28 #include "device/device_denoising.h"
29 #include "device/device_intern.h"
30 #include "device/device_split_kernel.h"
32 #include "kernel/kernel.h"
33 #include "kernel/kernel_compat_cpu.h"
34 #include "kernel/kernel_types.h"
35 #include "kernel/split/kernel_split_data.h"
36 #include "kernel/kernel_globals.h"
38 #include "kernel/filter/filter.h"
40 #include "kernel/osl/osl_shader.h"
41 #include "kernel/osl/osl_globals.h"
43 #include "render/buffers.h"
45 #include "util/util_debug.h"
46 #include "util/util_foreach.h"
47 #include "util/util_function.h"
48 #include "util/util_logging.h"
49 #include "util/util_map.h"
50 #include "util/util_opengl.h"
51 #include "util/util_progress.h"
52 #include "util/util_system.h"
53 #include "util/util_thread.h"
59 /* Has to be outside of the class to be shared across template instantiations. */
/* Tracks the last CPU architecture name logged, so the "Will be using ..."
 * message is not repeated for every KernelFunctions instantiation. */
60 static const char *logged_architecture = "";

/* Selects, at construction time, the best available CPU kernel variant
 * (SSE2/SSE3/SSE4.1/AVX/AVX2) based on compile-time options and runtime
 * CPU feature detection, falling back to the unoptimized default. */
63 class KernelFunctions {
70 KernelFunctions(F kernel_default,
77 const char *architecture_name = "default";
78 kernel = kernel_default;

80 /* Silence potential warnings about unused variables
81 * when compiling without some architectures. */
/* Checks are ordered from the most capable ISA down; the first supported
 * one wins. (AVX2/AVX kernel assignments are not visible in this view —
 * presumably elided; confirm against the full file.) */
87 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
88 if(system_cpu_support_avx2()) {
89 architecture_name = "AVX2";
94 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
95 if(system_cpu_support_avx()) {
96 architecture_name = "AVX";
101 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
102 if(system_cpu_support_sse41()) {
103 architecture_name = "SSE4.1";
104 kernel = kernel_sse41;
108 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
109 if(system_cpu_support_sse3()) {
110 architecture_name = "SSE3";
111 kernel = kernel_sse3;
115 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
116 if(system_cpu_support_sse2()) {
117 architecture_name = "SSE2";
118 kernel = kernel_sse2;

/* NOTE(review): strstr() performs a *substring* search. Since
 * logged_architecture starts as "", strstr(x, "") is always non-NULL, so
 * this condition is initially always true, and an unchanged architecture
 * name still matches itself on later instantiations — the guard never
 * suppresses repeated logging. Exact comparison,
 * strcmp(architecture_name, logged_architecture) != 0, looks intended;
 * confirm against upstream and fix. */
122 if(strstr(architecture_name, logged_architecture) != 0) {
123 VLOG(1) << "Will be using " << architecture_name << " kernels.";
124 logged_architecture = architecture_name;

/* Invoke the selected kernel function pointer. */
128 inline F operator()() const {
/* CPU backend of the split-kernel path tracer: drives the per-stage split
 * kernels sequentially on the host instead of on a GPU queue. */
136 class CPUSplitKernel : public DeviceSplitKernel {
139 explicit CPUSplitKernel(CPUDevice *device);

/* Runs the data_init split kernel over the whole work grid, wiring up the
 * globals/data/state buffers listed below. */
141 virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
143 int num_global_elements,
144 device_memory& kernel_globals,
145 device_memory& kernel_data_,
146 device_memory& split_data,
147 device_memory& ray_state,
148 device_memory& queue_index,
149 device_memory& use_queues_flag,
150 device_memory& work_pool_wgs);

/* Looks up a named split-kernel stage; local/global size queries and the
 * per-thread state-buffer size follow. */
152 virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
153 virtual int2 split_kernel_local_size();
154 virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
155 virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
/* CPU render device: owns the kernel globals, OSL state and one
 * KernelFunctions dispatcher per kernel entry point. */
158 class CPUDevice : public Device
162 KernelGlobals kernel_globals;

165 OSLGlobals osl_globals;

/* When true, use the split-kernel code path (mainly for debugging parity
 * with GPU devices); selected from debug flags in the constructor. */
168 bool use_split_kernel;

170 DeviceRequestedFeatures requested_features;

/* Main render/display kernels. */
172 KernelFunctions<void(*)(KernelGlobals *, float *, unsigned int *, int, int, int, int, int)> path_trace_kernel;
173 KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_half_float_kernel;
174 KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel;
175 KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, float*, int, int, int, int, int)> shader_kernel;

/* Denoising pre-pass kernels. */
177 KernelFunctions<void(*)(int, TilesInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int, bool)> filter_divide_shadow_kernel;
178 KernelFunctions<void(*)(int, TilesInfo*, int, int, int, int, float*, float*, int*, int, int, bool)> filter_get_feature_kernel;
179 KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel;

/* Non-local-means filter kernels. */
181 KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
182 KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel;
183 KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel;
184 KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel;
185 KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel;

/* Feature-space reconstruction kernels. */
187 KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel;
188 KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int)> filter_nlm_construct_gramian_kernel;
189 KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel;

/* Split-kernel entry points: data_init plus a name-keyed table of the
 * per-stage kernels registered in the constructor. */
191 KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*,
192 ccl_global uint*, int, int, int, int, int, int, int, int, ccl_global int*, int,
193 ccl_global char*, ccl_global unsigned int*, unsigned int, ccl_global float*)> data_init_kernel;
194 unordered_map<string, KernelFunctions<void(*)(KernelGlobals*, KernelData*)> > split_kernels;

/* Expands to the full list of per-ISA entry points for one kernel name,
 * matching the KernelFunctions constructor parameter order. */
196 #define KERNEL_FUNCTIONS(name) \
197 KERNEL_NAME_EVAL(cpu, name), \
198 KERNEL_NAME_EVAL(cpu_sse2, name), \
199 KERNEL_NAME_EVAL(cpu_sse3, name), \
200 KERNEL_NAME_EVAL(cpu_sse41, name), \
201 KERNEL_NAME_EVAL(cpu_avx, name), \
202 KERNEL_NAME_EVAL(cpu_avx2, name)
/* Constructor: binds every kernel dispatcher to its per-ISA entry points
 * via REGISTER_KERNEL, wires OSL globals into the kernel globals, and
 * registers the split-kernel stages when the split path is enabled. */
204 CPUDevice(DeviceInfo& info, Stats &stats, bool background)
205 : Device(info, stats, background),
206 #define REGISTER_KERNEL(name) name ## _kernel(KERNEL_FUNCTIONS(name))
207 REGISTER_KERNEL(path_trace),
208 REGISTER_KERNEL(convert_to_half_float),
209 REGISTER_KERNEL(convert_to_byte),
210 REGISTER_KERNEL(shader),
211 REGISTER_KERNEL(filter_divide_shadow),
212 REGISTER_KERNEL(filter_get_feature),
213 REGISTER_KERNEL(filter_combine_halves),
214 REGISTER_KERNEL(filter_nlm_calc_difference),
215 REGISTER_KERNEL(filter_nlm_blur),
216 REGISTER_KERNEL(filter_nlm_calc_weight),
217 REGISTER_KERNEL(filter_nlm_update_output),
218 REGISTER_KERNEL(filter_nlm_normalize),
219 REGISTER_KERNEL(filter_construct_transform),
220 REGISTER_KERNEL(filter_nlm_construct_gramian),
221 REGISTER_KERNEL(filter_finalize),
222 REGISTER_KERNEL(data_init)
223 #undef REGISTER_KERNEL

/* Give kernels access to OSL state through the shared globals. */
227 kernel_globals.osl = &osl_globals;

/* Split-kernel mode is a debug option on CPU. */
229 use_split_kernel = DebugFlags().cpu.split_kernel;
230 if(use_split_kernel) {
231 VLOG(1) << "Will be using split kernel.";

/* Populate the name -> kernel table used by get_split_kernel_function(). */
234 #define REGISTER_SPLIT_KERNEL(name) split_kernels[#name] = KernelFunctions<void(*)(KernelGlobals*, KernelData*)>(KERNEL_FUNCTIONS(name))
235 REGISTER_SPLIT_KERNEL(path_init);
236 REGISTER_SPLIT_KERNEL(scene_intersect);
237 REGISTER_SPLIT_KERNEL(lamp_emission);
238 REGISTER_SPLIT_KERNEL(do_volume);
239 REGISTER_SPLIT_KERNEL(queue_enqueue);
240 REGISTER_SPLIT_KERNEL(indirect_background);
241 REGISTER_SPLIT_KERNEL(shader_setup);
242 REGISTER_SPLIT_KERNEL(shader_sort);
243 REGISTER_SPLIT_KERNEL(shader_eval);
244 REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao);
245 REGISTER_SPLIT_KERNEL(subsurface_scatter);
246 REGISTER_SPLIT_KERNEL(direct_lighting);
247 REGISTER_SPLIT_KERNEL(shadow_blocked_ao);
248 REGISTER_SPLIT_KERNEL(shadow_blocked_dl);
249 REGISTER_SPLIT_KERNEL(next_iteration_setup);
250 REGISTER_SPLIT_KERNEL(indirect_subsurface);
251 REGISTER_SPLIT_KERNEL(buffer_update);
252 #undef REGISTER_SPLIT_KERNEL
253 #undef KERNEL_FUNCTIONS
/* Per-sample progress display only makes sense with a single render thread. */
261 virtual bool show_samples() const
263 return (TaskScheduler::num_threads() == 1);
/* "Allocate" device memory: on CPU the host data pointer is reused directly;
 * only when there is no host buffer is a separate malloc made (freed later in
 * mem_free(), which checks data_pointer for ownership). */
266 void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/)
269 VLOG(1) << "Buffer allocate: " << name << ", "
270 << string_human_readable_number(mem.memory_size()) << " bytes. ("
271 << string_human_readable_size(mem.memory_size()) << ")";

274 mem.device_pointer = mem.data_pointer;

276 if(!mem.device_pointer) {
277 mem.device_pointer = (device_ptr)malloc(mem.memory_size());

/* Account the allocation against device stats. */
280 mem.device_size = mem.memory_size();
281 stats.mem_alloc(mem.device_size);
/* No-op on CPU: host and device memory are the same allocation. */
284 void mem_copy_to(device_memory& /*mem*/)
/* No-op on CPU: reads already see the shared host/device buffer. */
289 void mem_copy_from(device_memory& /*mem*/,
290 int /*y*/, int /*w*/, int /*h*/,
/* Clear the whole buffer in place. */
296 void mem_zero(device_memory& mem)
298 memset((void*)mem.device_pointer, 0, mem.memory_size());
/* Release device memory: free() only buffers this device malloc'ed itself
 * (i.e. when there was no host data_pointer in mem_alloc()). */
301 void mem_free(device_memory& mem)
303 if(mem.device_pointer) {
304 if(!mem.data_pointer) {
305 free((void*)mem.device_pointer);
307 mem.device_pointer = 0;
308 stats.mem_free(mem.device_size);
/* Sub-allocation is plain pointer arithmetic into the parent buffer;
 * nothing is allocated, so no matching free is needed. */
313 virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/)
315 return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset));
/* Copy named constant data (e.g. KernelData) into the kernel globals. */
318 void const_copy_to(const char *name, void *host, size_t size)
320 kernel_const_copy(&kernel_globals, name, host, size);
/* Register a texture with the kernel globals. As with mem_alloc(), the host
 * buffer doubles as device storage; the copy is by reference inside
 * kernel_tex_copy(), not a memcpy. */
323 void tex_alloc(const char *name,
325 InterpolationType interpolation,
326 ExtensionType extension)
328 VLOG(1) << "Texture allocate: " << name << ", "
329 << string_human_readable_number(mem.memory_size()) << " bytes. ("
330 << string_human_readable_size(mem.memory_size()) << ")";
331 kernel_tex_copy(&kernel_globals,

339 mem.device_pointer = mem.data_pointer;
340 mem.device_size = mem.memory_size();
341 stats.mem_alloc(mem.device_size);
/* Unregister a texture: clears the device pointer and stats only — the
 * underlying storage is the caller's host buffer, so nothing is free()d
 * here (presumably the owner releases it; confirm against the full file). */
344 void tex_free(device_memory& mem)
346 if(mem.device_pointer) {
347 mem.device_pointer = 0;
348 stats.mem_free(mem.device_size);
/* Worker-thread entry point: dispatch on the task type. */
362 void thread_run(DeviceTask *task)
364 if(task->type == DeviceTask::RENDER) {
365 thread_render(*task);
367 else if(task->type == DeviceTask::FILM_CONVERT)
368 thread_film_convert(*task);
369 else if(task->type == DeviceTask::SHADER)
370 thread_shader(*task);
/* Task wrapper that binds its run callback to CPUDevice::thread_run for
 * execution on the device task pool. */
373 class CPUDeviceTask : public DeviceTask {
375 CPUDeviceTask(CPUDevice *device, DeviceTask& task)
378 run = function_bind(&CPUDevice::thread_run, device, this);
/* Allocate the TilesInfo buffer and record the 3x3 neighborhood's buffer
 * pointers (center tile plus 8 neighbors) for the denoiser. */
382 bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task)
384 mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY);

386 TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer;
387 for(int i = 0; i < 9; i++) {
388 tiles->buffers[i] = buffers[i];
/* Non-local-means filter: for every offset (dx, dy) in a (2r+1)^2 window,
 * compute per-pixel differences against the guide image, blur them, turn
 * them into weights, and accumulate the weighted image into out_ptr;
 * finally normalize by the accumulated weights. */
394 bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
397 int4 rect = task->rect;
398 int r = task->nlm_state.r;
399 int f = task->nlm_state.f;
400 float a = task->nlm_state.a;
401 float k_2 = task->nlm_state.k_2;

/* Row stride is the rect width aligned up to 4 floats. */
403 int w = align_up(rect.z-rect.x, 4);
404 int h = rect.w-rect.y;

/* Scratch buffers provided by the NLM state. */
406 float *blurDifference = (float*) task->nlm_state.temporary_1_ptr;
407 float *difference = (float*) task->nlm_state.temporary_2_ptr;
408 float *weightAccum = (float*) task->nlm_state.temporary_3_ptr;

410 memset(weightAccum, 0, sizeof(float)*w*h);
411 memset((float*) out_ptr, 0, sizeof(float)*w*h);

/* Enumerate window offsets; i is decomposed into (dx, dy) in [-r, r]. */
413 for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
414 int dy = i / (2*r+1) - r;
415 int dx = i % (2*r+1) - r;

/* Shrink the processed rect so shifted reads stay inside bounds. */
417 int local_rect[4] = {max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)};
418 filter_nlm_calc_difference_kernel()(dx, dy,
420 (float*) variance_ptr,

/* blur -> weight -> blur sequence matches the GPU denoiser pipeline. */
426 filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f);
427 filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f);
428 filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f);

430 filter_nlm_update_output_kernel()(dx, dy,

/* Divide accumulated colors by accumulated weights over the full rect. */
439 int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y};
440 filter_nlm_normalize_kernel()((float*) out_ptr, weightAccum, local_rect, w);
/* Build the per-pixel feature-space transform (and rank) for every pixel in
 * the filter area; storage is indexed as y*width + x. */
445 bool denoising_construct_transform(DenoisingTask *task)
447 for(int y = 0; y < task->filter_area.w; y++) {
448 for(int x = 0; x < task->filter_area.z; x++) {
449 filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer,
450 x + task->filter_area.x,
451 y + task->filter_area.y,
452 y*task->filter_area.z + x,
453 (float*) task->storage.transform.device_pointer,
454 (int*) task->storage.rank.device_pointer,
456 task->buffer.pass_stride,
458 task->pca_threshold);
/* Final reconstruction: accumulate the weighted least-squares system
 * (XtWX, XtWY) over all NLM window offsets, then solve it per pixel in
 * the finalize pass to produce the denoised output. */
464 bool denoising_reconstruct(device_ptr color_ptr,
465 device_ptr color_variance_ptr,
466 device_ptr guide_ptr,
467 device_ptr guide_variance_ptr,
468 device_ptr output_ptr,

/* Start from a zeroed normal-equation system. */
471 mem_zero(task->storage.XtWX);
472 mem_zero(task->storage.XtWY);

474 float *difference = (float*) task->reconstruction_state.temporary_1_ptr;
475 float *blurDifference = (float*) task->reconstruction_state.temporary_2_ptr;

/* Same (dx, dy) window enumeration as denoising_non_local_means(). */
477 int r = task->radius;
478 for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
479 int dy = i / (2*r+1) - r;
480 int dx = i % (2*r+1) - r;

482 int local_rect[4] = {max(0, -dx), max(0, -dy),
483 task->reconstruction_state.source_w - max(0, dx),
484 task->reconstruction_state.source_h - max(0, dy)};
485 filter_nlm_calc_difference_kernel()(dx, dy,
487 (float*) guide_variance_ptr,
491 task->buffer.pass_stride,

/* NLM weighting uses a fixed patch size of 4 here. */
494 filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4);
495 filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.w, 4);
496 filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4);
497 filter_nlm_construct_gramian_kernel()(dx, dy,
499 (float*) task->buffer.mem.device_pointer,
501 (float*) color_variance_ptr,
502 (float*) task->storage.transform.device_pointer,
503 (int*) task->storage.rank.device_pointer,
504 (float*) task->storage.XtWX.device_pointer,
505 (float3*) task->storage.XtWY.device_pointer,
507 &task->reconstruction_state.filter_rect.x,
511 task->buffer.pass_stride);

/* Solve the accumulated system per pixel of the filter area. */
513 for(int y = 0; y < task->filter_area.w; y++) {
514 for(int x = 0; x < task->filter_area.z; x++) {
515 filter_finalize_kernel()(x,
517 y*task->filter_area.z + x,
521 (int*) task->storage.rank.device_pointer,
522 (float*) task->storage.XtWX.device_pointer,
523 (float3*) task->storage.XtWY.device_pointer,
524 &task->reconstruction_state.buffer_params.x,
525 task->render_buffer.samples);
/* Combine two half-sample buffers into a mean and variance estimate, one
 * pixel at a time over the given rect. */
531 bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
532 device_ptr mean_ptr, device_ptr variance_ptr,
533 int r, int4 rect, DenoisingTask *task)
536 for(int y = rect.y; y < rect.w; y++) {
537 for(int x = rect.x; x < rect.z; x++) {
538 filter_combine_halves_kernel()(x, y,
540 (float*) variance_ptr,
/* Denoising pre-pass: divide the shadow feature buffers and derive the
 * sample/buffer variance estimates for every pixel of the task rect. */
550 bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr,
551 device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
552 device_ptr buffer_variance_ptr, DenoisingTask *task)
554 for(int y = task->rect.y; y < task->rect.w; y++) {
555 for(int x = task->rect.x; x < task->rect.z; x++) {
556 filter_divide_shadow_kernel()(task->render_buffer.samples,
561 (float*) sample_variance_ptr,
562 (float*) sv_variance_ptr,
563 (float*) buffer_variance_ptr,
565 task->render_buffer.pass_stride,
566 task->render_buffer.denoising_data_offset,
/* Extract one denoising feature pass (mean + variance at the given offsets)
 * from the render buffer for every pixel of the task rect. */
573 bool denoising_get_feature(int mean_offset,
576 device_ptr variance_ptr,
579 for(int y = task->rect.y; y < task->rect.w; y++) {
580 for(int x = task->rect.x; x < task->rect.z; x++) {
581 filter_get_feature_kernel()(task->render_buffer.samples,
587 (float*) variance_ptr,
589 task->render_buffer.pass_stride,
590 task->render_buffer.denoising_data_offset,
/* Megakernel path tracing of one tile: iterate samples, and within each
 * sample every pixel; checks for cancellation between samples and reports
 * progress after each completed sample. */
597 void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
599 float *render_buffer = (float*)tile.buffer;
600 uint *rng_state = (uint*)tile.rng_state;
601 int start_sample = tile.start_sample;
602 int end_sample = tile.start_sample + tile.num_samples;

604 for(int sample = start_sample; sample < end_sample; sample++) {
/* Bail out early on cancel unless the finish queue must be drained. */
605 if(task.get_cancel() || task_pool.canceled()) {
606 if(task.need_finish_queue == false)

610 for(int y = tile.y; y < tile.y + tile.h; y++) {
611 for(int x = tile.x; x < tile.x + tile.w; x++) {
612 path_trace_kernel()(kg, render_buffer, rng_state,
613 sample, x, y, tile.offset, tile.stride);

/* Record the number of samples completed so far on the tile. */
617 tile.sample = sample + 1;

619 task.update_progress(&tile, tile.w*tile.h);
/* Denoise one finished tile: bind this device's denoising callbacks into a
 * DenoisingTask, gather the 3x3 tile neighborhood, and run the filter. */
623 void denoise(DeviceTask &task, RenderTile &tile)
625 tile.sample = tile.start_sample + tile.num_samples;

627 DenoisingTask denoising(this);

/* Placeholders _1.._6 forward the runtime arguments of each callback. */
629 denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising);
630 denoising.functions.reconstruct = function_bind(&CPUDevice::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising);
631 denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
632 denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
633 denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
634 denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
635 denoising.functions.set_tiles = function_bind(&CPUDevice::denoising_set_tiles, this, _1, &denoising);

637 denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
638 denoising.render_buffer.samples = tile.sample;

/* Map the center tile plus its 8 neighbors for cross-tile filtering. */
640 RenderTile rtiles[9];
642 task.map_neighbor_tiles(rtiles, this);
643 denoising.tiles_from_rendertiles(rtiles);

645 denoising.init_from_devicetask(task);

647 denoising.run_denoising();

649 task.unmap_neighbor_tiles(rtiles, this);

651 task.update_progress(&tile, tile.w*tile.h);
/* Render-task worker: sets up per-thread kernel globals (placement-new into
 * a device buffer), optionally loads the split kernel, then loops acquiring
 * tiles and path-tracing or denoising them until no tiles remain. */
654 void thread_render(DeviceTask& task)
656 if(task_pool.canceled()) {
657 if(task.need_finish_queue == false)

661 /* allocate buffer for kernel globals */
662 device_only_memory<KernelGlobals> kgbuffer;
664 mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE);

/* Placement-new the per-thread globals into the device buffer. */
666 KernelGlobals *kg = new ((void*) kgbuffer.device_pointer) KernelGlobals(thread_kernel_globals_init());

668 CPUSplitKernel *split_kernel = NULL;
669 if(use_split_kernel) {
670 split_kernel = new CPUSplitKernel(this);
671 requested_features.max_closure = MAX_CLOSURE;
672 if(!split_kernel->load_kernels(requested_features)) {
/* On kernel load failure, release per-thread globals before bailing.
 * NOTE(review): cleanup on this path looks partial in this view
 * (kgbuffer/split_kernel release not visible) — confirm in full file. */
673 thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);

682 while(task.acquire_tile(this, tile)) {
683 if(tile.task == RenderTile::PATH_TRACE) {
684 if(use_split_kernel) {
686 split_kernel->path_trace(&task, tile, kgbuffer, data);
689 path_trace(task, tile, kg);
692 else if(tile.task == RenderTile::DENOISE) {

696 task.release_tile(tile);

698 if(task_pool.canceled()) {
699 if(task.need_finish_queue == false)

/* Tear down the per-thread globals created above. */
704 thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
/* Film-convert worker: scale accumulated samples to display range and write
 * either half-float or byte output, per pixel of the task region. */
709 void thread_film_convert(DeviceTask& task)
711 float sample_scale = 1.0f/(task.sample + 1);

714 for(int y = task.y; y < task.y + task.h; y++)
715 for(int x = task.x; x < task.x + task.w; x++)
716 convert_to_half_float_kernel()(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
717 sample_scale, x, y, task.offset, task.stride);

720 for(int y = task.y; y < task.y + task.h; y++)
721 for(int x = task.x; x < task.x + task.w; x++)
722 convert_to_byte_kernel()(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
723 sample_scale, x, y, task.offset, task.stride);
/* Shader-evaluation worker (baking/displacement): runs the shader kernel
 * over the requested range for each sample, with per-thread OSL state. */
728 void thread_shader(DeviceTask& task)
/* Work on a thread-local copy of the globals so OSL init is isolated. */
730 KernelGlobals kg = kernel_globals;

733 OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);

735 for(int sample = 0; sample < task.num_samples; sample++) {
736 for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
738 (uint4*)task.shader_input,
739 (float4*)task.shader_output,
740 (float*)task.shader_output_luma,
741 task.shader_eval_type,

/* Stop between samples when the task or pool was canceled. */
747 if(task.get_cancel() || task_pool.canceled())

750 task.update_progress(NULL);

755 OSLShader::thread_free(&kg);
/* Number of subtasks a task will be split into; shader tasks use a minimum
 * chunk size of 256 elements. */
759 int get_split_task_count(DeviceTask& task)
761 if(task.type == DeviceTask::SHADER)
762 return task.get_subtask_count(TaskScheduler::num_threads(), 256);
764 return task.get_subtask_count(TaskScheduler::num_threads());
/* Split the incoming task per thread (matching get_split_task_count) and
 * push each piece onto the device task pool. */
767 void task_add(DeviceTask& task)
769 /* split task into smaller ones */
770 list<DeviceTask> tasks;

772 if(task.type == DeviceTask::SHADER)
773 task.split(tasks, TaskScheduler::num_threads(), 256);
775 task.split(tasks, TaskScheduler::num_threads());

777 foreach(DeviceTask& task, tasks)
778 task_pool.push(new CPUDeviceTask(this, task));

783 task_pool.wait_work();
/* Create per-thread kernel globals: copy the shared globals, reset the
 * lazily-allocated per-thread caches, and initialize OSL thread state.
 * Paired with thread_kernel_globals_free(). */
792 inline KernelGlobals thread_kernel_globals_init()
794 KernelGlobals kg = kernel_globals;
795 kg.transparent_shadow_intersections = NULL;
796 const int decoupled_count = sizeof(kg.decoupled_volume_steps) /
797 sizeof(*kg.decoupled_volume_steps);
798 for(int i = 0; i < decoupled_count; ++i) {
799 kg.decoupled_volume_steps[i] = NULL;
801 kg.decoupled_volume_steps_index = 0;

803 OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
/* Release the per-thread allocations made lazily by the kernels (shadow
 * intersection scratch, decoupled volume step arrays) and OSL thread data.
 * Counterpart of thread_kernel_globals_init(). */
808 inline void thread_kernel_globals_free(KernelGlobals *kg)
814 if(kg->transparent_shadow_intersections != NULL) {
815 free(kg->transparent_shadow_intersections);
817 const int decoupled_count = sizeof(kg->decoupled_volume_steps) /
818 sizeof(*kg->decoupled_volume_steps);
819 for(int i = 0; i < decoupled_count; ++i) {
820 if(kg->decoupled_volume_steps[i] != NULL) {
821 free(kg->decoupled_volume_steps[i]);

825 OSLShader::thread_free(kg);
/* Record the requested features; CPU kernels are compiled in, so there is
 * nothing further to load here. */
829 virtual bool load_kernels(DeviceRequestedFeatures& requested_features_) {
830 requested_features = requested_features_;
/* Wraps one split-kernel stage function pointer; enqueue() emulates a GPU
 * dispatch by iterating the global work grid serially on the host. */
838 class CPUSplitKernelFunction : public SplitKernelFunction {
841 void (*func)(KernelGlobals *kg, KernelData *data);

843 CPUSplitKernelFunction(CPUDevice* device) : device(device), func(NULL) {}
844 ~CPUSplitKernelFunction() {}

846 virtual bool enqueue(const KernelDimensions& dim, device_memory& kernel_globals, device_memory& data)
852 KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
853 kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);

/* Serial emulation of a parallel launch: one call per work item, with
 * global_id updated in the shared globals between calls. */
855 for(int y = 0; y < dim.global_size[1]; y++) {
856 for(int x = 0; x < dim.global_size[0]; x++) {
857 kg->global_id = make_int2(x, y);

859 func(kg, (KernelData*)data.device_pointer);
/* Keep a typed pointer to the owning CPU device alongside the base class. */
867 CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device)
/* Run the data_init split kernel for every work item, passing the split
 * state buffers and the current render tile's parameters. Mirrors
 * CPUSplitKernelFunction::enqueue()'s serial grid walk. */
871 bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
873 int num_global_elements,
874 device_memory& kernel_globals,
876 device_memory& split_data,
877 device_memory& ray_state,
878 device_memory& queue_index,
879 device_memory& use_queues_flags,
880 device_memory& work_pool_wgs)
882 KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
883 kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);

885 for(int y = 0; y < dim.global_size[1]; y++) {
886 for(int x = 0; x < dim.global_size[0]; x++) {
887 kg->global_id = make_int2(x, y);

889 device->data_init_kernel()((KernelGlobals*)kernel_globals.device_pointer,
890 (KernelData*)data.device_pointer,
891 (void*)split_data.device_pointer,
893 (char*)ray_state.device_pointer,
894 (uint*)rtile.rng_state,
896 rtile.start_sample + rtile.num_samples,
903 (int*)queue_index.device_pointer,
904 dim.global_size[0] * dim.global_size[1],
905 (char*)use_queues_flags.device_pointer,
906 (uint*)work_pool_wgs.device_pointer,
908 (float*)rtile.buffer);
/* Look up a split-kernel stage by name in the device's registered table.
 * NOTE(review): operator[] on split_kernels default-inserts for an unknown
 * name, yielding a NULL func — presumably kernel_name is always registered;
 * confirm callers. Caller owns the returned object. */
915 SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&)
917 CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);

919 kernel->func = device->split_kernels[kernel_name]();
/* CPU has no work groups: both local and global launch sizes are 1x1
 * (the per-thread state size below scales with num_threads instead). */
928 int2 CPUSplitKernel::split_kernel_local_size()
930 return make_int2(1, 1);

933 int2 CPUSplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask * /*task*/) {
934 return make_int2(1, 1);
/* Size of the split-kernel state buffer for the given thread count, as
 * computed by the shared split-data helper. */
937 uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device_memory& /*data*/, size_t num_threads) {
938 KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;

940 return split_data_buffer_size(kg, num_threads);
/* Factory entry point for the CPU device; caller owns the result. */
943 Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
945 return new CPUDevice(info, stats, background);
/* Describe the CPU device and register it first in the device list. */
948 void device_cpu_info(vector<DeviceInfo>& devices)
952 info.type = DEVICE_CPU;
953 info.description = system_cpu_brand_string();
956 info.advanced_shading = true;
957 info.pack_images = false;

/* CPU is always available, so it goes at the front of the list. */
959 devices.insert(devices.begin(), info);
/* Build a space-separated list of supported SIMD instruction sets, then
 * strip a single trailing space. */
962 string device_cpu_capabilities(void)
964 string capabilities = "";
965 capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
966 capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
967 capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
968 capabilities += system_cpu_support_avx() ? "AVX " : "";
969 capabilities += system_cpu_support_avx2() ? "AVX2" : "";
/* NOTE(review): if no feature is supported, capabilities is empty and
 * capabilities.size() - 1 underflows (size_t), so operator[] indexes far
 * out of bounds — undefined behavior. Guard with !capabilities.empty()
 * before this check. */
970 if(capabilities[capabilities.size() - 1] == ' ')
971 capabilities.resize(capabilities.size() - 1);