/*
 * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* So ImathMath is included before our kernel_cpu_compat. */
/* So no context pollution happens from indirectly included windows.h */
#  include "util/util_windows.h"
#  include <OSL/oslexec.h>

#include "device/device.h"
#include "device/device_intern.h"
#include "device/device_split_kernel.h"

#include "kernel/kernel.h"
#include "kernel/kernel_compat_cpu.h"
#include "kernel/kernel_types.h"
#include "kernel/split/kernel_split_data.h"
#include "kernel/kernel_globals.h"

#include "kernel/osl/osl_shader.h"
#include "kernel/osl/osl_globals.h"

#include "render/buffers.h"

#include "util/util_debug.h"
#include "util/util_foreach.h"
#include "util/util_function.h"
#include "util/util_logging.h"
#include "util/util_map.h"
#include "util/util_opengl.h"
#include "util/util_progress.h"
#include "util/util_system.h"
#include "util/util_thread.h"
class CPUSplitKernel : public DeviceSplitKernel {
	CPUDevice *device;

public:
	explicit CPUSplitKernel(CPUDevice *device);

	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
	                                            RenderTile& rtile,
	                                            int num_global_elements,
	                                            device_memory& kernel_globals,
	                                            device_memory& kernel_data_,
	                                            device_memory& split_data,
	                                            device_memory& ray_state,
	                                            device_memory& queue_index,
	                                            device_memory& use_queues_flag,
	                                            device_memory& work_pool_wgs);

	virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
	virtual int2 split_kernel_local_size();
	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
	virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
};
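
/* CPU rendering device. Kernels run directly in the host process, and the most
 * capable instruction set variant (SSE2/SSE3/SSE4.1/AVX/AVX2) that is both
 * compiled in and supported by the CPU is selected at runtime. */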
class CPUDevice : public Device
{
	static unordered_map<string, void*> kernel_functions;

	static void register_kernel_function(const char* name, void* func)
	{
		kernel_functions[name] = func;
	}

	static const char* get_arch_name()
	{
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
		if(system_cpu_support_avx2()) {
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
		if(system_cpu_support_avx()) {
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
		if(system_cpu_support_sse41()) {
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
		if(system_cpu_support_sse3()) {
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
		if(system_cpu_support_sse2()) {

	template<typename F>
	static F get_kernel_function(string name)
	{
		name = string("kernel_") + get_arch_name() + "_" + name;

		unordered_map<string, void*>::iterator it = kernel_functions.find(name);

		if(it == kernel_functions.end()) {
			assert(!"kernel function not found");
		}

		return (F)it->second;
	}
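
	/* Kernel lookup note: the logical kernel name is prefixed with the
	 * detected architecture (see get_arch_name() above), so the same name
	 * resolves to e.g. the AVX2 variant of a registered kernel on CPUs that
	 * support it, provided those kernels were compiled in. */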

	friend class CPUSplitKernel;

	KernelGlobals kernel_globals;

	OSLGlobals osl_globals;

	bool use_split_kernel;

	DeviceRequestedFeatures requested_features;

	CPUDevice(DeviceInfo& info, Stats &stats, bool background)
	: Device(info, stats, background)
	{
		kernel_globals.osl = &osl_globals;

		/* Do the CPU support checks once now, to avoid thread safety issues
		 * when they are first triggered from worker threads. */
		system_cpu_support_sse2();
		system_cpu_support_sse3();
		system_cpu_support_sse41();
		system_cpu_support_avx();
		system_cpu_support_avx2();

#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
		if(system_cpu_support_avx2()) {
			VLOG(1) << "Will be using AVX2 kernels.";
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
		if(system_cpu_support_avx()) {
			VLOG(1) << "Will be using AVX kernels.";
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
		if(system_cpu_support_sse41()) {
			VLOG(1) << "Will be using SSE4.1 kernels.";
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
		if(system_cpu_support_sse3()) {
			VLOG(1) << "Will be using SSE3 kernels.";
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
		if(system_cpu_support_sse2()) {
			VLOG(1) << "Will be using SSE2 kernels.";
			VLOG(1) << "Will be using regular kernels.";

		use_split_kernel = DebugFlags().cpu.split_kernel;
		if(use_split_kernel) {
			VLOG(1) << "Will be using split kernel.";
		}

		kernel_cpu_register_functions(register_kernel_function);
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
		kernel_cpu_sse2_register_functions(register_kernel_function);
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
		kernel_cpu_sse3_register_functions(register_kernel_function);
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
		kernel_cpu_sse41_register_functions(register_kernel_function);
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
		kernel_cpu_avx_register_functions(register_kernel_function);
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
		kernel_cpu_avx2_register_functions(register_kernel_function);
#endif
	}

	virtual bool show_samples() const
	{
		return (TaskScheduler::num_threads() == 1);
	}
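
	/* Memory management: on the CPU, device memory is simply host memory, so
	 * allocation reuses the existing host data pointer when possible (falling
	 * back to malloc()) and host/device copies are effectively no-ops. */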

	void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/)
	{
		VLOG(1) << "Buffer allocate: " << name << ", "
		        << string_human_readable_number(mem.memory_size()) << " bytes. ("
		        << string_human_readable_size(mem.memory_size()) << ")";

		mem.device_pointer = mem.data_pointer;

		if(!mem.device_pointer) {
			mem.device_pointer = (device_ptr)malloc(mem.memory_size());
		}

		mem.device_size = mem.memory_size();
		stats.mem_alloc(mem.device_size);
	}

	void mem_copy_to(device_memory& /*mem*/) {}

	void mem_copy_from(device_memory& /*mem*/,
	                   int /*y*/, int /*w*/, int /*h*/,
	                   int /*elem*/) {}

	void mem_zero(device_memory& mem)
	{
		memset((void*)mem.device_pointer, 0, mem.memory_size());
	}

	void mem_free(device_memory& mem)
	{
		if(mem.device_pointer) {
			if(!mem.data_pointer)
				free((void*)mem.device_pointer);

			mem.device_pointer = 0;
			stats.mem_free(mem.device_size);
		}
	}

	void const_copy_to(const char *name, void *host, size_t size)
	{
		kernel_const_copy(&kernel_globals, name, host, size);
	}

	void tex_alloc(const char *name,
	               device_memory& mem,
	               InterpolationType interpolation,
	               ExtensionType extension)
	{
		VLOG(1) << "Texture allocate: " << name << ", "
		        << string_human_readable_number(mem.memory_size()) << " bytes. ("
		        << string_human_readable_size(mem.memory_size()) << ")";
		kernel_tex_copy(&kernel_globals,
		                name,
		                mem.data_pointer,
		                mem.data_width,
		                mem.data_height,
		                mem.data_depth,
		                interpolation,
		                extension);
		mem.device_pointer = mem.data_pointer;
		mem.device_size = mem.memory_size();
		stats.mem_alloc(mem.device_size);
	}

	void tex_free(device_memory& mem)
	{
		if(mem.device_pointer) {
			mem.device_pointer = 0;
			stats.mem_free(mem.device_size);
		}
	}
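
	/* Task execution: each DeviceTask pushed to the device runs on a worker
	 * thread and is dispatched to the matching handler below. */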
	void thread_run(DeviceTask *task)
	{
		if(task->type == DeviceTask::PATH_TRACE) {
			if(!use_split_kernel) {
				thread_path_trace(*task);
			}
			else {
				thread_path_trace_split(*task);
			}
		}
		else if(task->type == DeviceTask::FILM_CONVERT)
			thread_film_convert(*task);
		else if(task->type == DeviceTask::SHADER)
			thread_shader(*task);
	}

	class CPUDeviceTask : public DeviceTask {
	public:
		CPUDeviceTask(CPUDevice *device, DeviceTask& task)
		: DeviceTask(task)
		{
			run = function_bind(&CPUDevice::thread_run, device, this);
		}
	};

	void thread_path_trace(DeviceTask& task)
	{
		if(task_pool.canceled()) {
			if(task.need_finish_queue == false)
				return;
		}

		KernelGlobals kg = thread_kernel_globals_init();

		RenderTile tile;

		void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int);

#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
		if(system_cpu_support_avx2()) {
			path_trace_kernel = kernel_cpu_avx2_path_trace;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
		if(system_cpu_support_avx()) {
			path_trace_kernel = kernel_cpu_avx_path_trace;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
		if(system_cpu_support_sse41()) {
			path_trace_kernel = kernel_cpu_sse41_path_trace;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
		if(system_cpu_support_sse3()) {
			path_trace_kernel = kernel_cpu_sse3_path_trace;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
		if(system_cpu_support_sse2()) {
			path_trace_kernel = kernel_cpu_sse2_path_trace;
			path_trace_kernel = kernel_cpu_path_trace;

		while(task.acquire_tile(this, tile)) {
			float *render_buffer = (float*)tile.buffer;
			uint *rng_state = (uint*)tile.rng_state;
			int start_sample = tile.start_sample;
			int end_sample = tile.start_sample + tile.num_samples;

			for(int sample = start_sample; sample < end_sample; sample++) {
				if(task.get_cancel() || task_pool.canceled()) {
					if(task.need_finish_queue == false)
						break;
				}

				for(int y = tile.y; y < tile.y + tile.h; y++) {
					for(int x = tile.x; x < tile.x + tile.w; x++) {
						path_trace_kernel(&kg, render_buffer, rng_state,
						                  sample, x, y, tile.offset, tile.stride);
					}
				}

				tile.sample = sample + 1;

				task.update_progress(&tile, tile.w*tile.h);
			}

			task.release_tile(tile);

			if(task_pool.canceled()) {
				if(task.need_finish_queue == false)
					break;
			}
		}

		thread_kernel_globals_free(&kg);
	}
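
	/* Split kernel path tracing. Kernel globals are wrapped in a device_memory
	 * buffer so the shared DeviceSplitKernel code can treat the CPU like any
	 * other split kernel device. */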
	void thread_path_trace_split(DeviceTask& task)
	{
		if(task_pool.canceled()) {
			if(task.need_finish_queue == false)
				return;
		}

		RenderTile tile;

		CPUSplitKernel split_kernel(this);

		/* allocate buffer for kernel globals */
		device_memory kgbuffer;
		kgbuffer.resize(sizeof(KernelGlobals));
		mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE);

		KernelGlobals *kg = (KernelGlobals*)kgbuffer.device_pointer;
		*kg = thread_kernel_globals_init();

		requested_features.max_closure = MAX_CLOSURE;
		if(!split_kernel.load_kernels(requested_features)) {
			thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
			return;
		}

		while(task.acquire_tile(this, tile)) {
			device_memory data;
			split_kernel.path_trace(&task, tile, kgbuffer, data);

			task.release_tile(tile);

			if(task_pool.canceled()) {
				if(task.need_finish_queue == false)
					break;
			}
		}

		thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
	}
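
	/* Film conversion: turns accumulated render buffer values into displayable
	 * half float or byte pixels, scaled by the number of samples taken. */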
	void thread_film_convert(DeviceTask& task)
	{
		float sample_scale = 1.0f/(task.sample + 1);

		if(task.rgba_half) {
			void(*convert_to_half_float_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
			if(system_cpu_support_avx2()) {
				convert_to_half_float_kernel = kernel_cpu_avx2_convert_to_half_float;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
			if(system_cpu_support_avx()) {
				convert_to_half_float_kernel = kernel_cpu_avx_convert_to_half_float;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
			if(system_cpu_support_sse41()) {
				convert_to_half_float_kernel = kernel_cpu_sse41_convert_to_half_float;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
			if(system_cpu_support_sse3()) {
				convert_to_half_float_kernel = kernel_cpu_sse3_convert_to_half_float;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
			if(system_cpu_support_sse2()) {
				convert_to_half_float_kernel = kernel_cpu_sse2_convert_to_half_float;
				convert_to_half_float_kernel = kernel_cpu_convert_to_half_float;

			for(int y = task.y; y < task.y + task.h; y++)
				for(int x = task.x; x < task.x + task.w; x++)
					convert_to_half_float_kernel(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
					                             sample_scale, x, y, task.offset, task.stride);
		}
		else {
			void(*convert_to_byte_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
			if(system_cpu_support_avx2()) {
				convert_to_byte_kernel = kernel_cpu_avx2_convert_to_byte;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
			if(system_cpu_support_avx()) {
				convert_to_byte_kernel = kernel_cpu_avx_convert_to_byte;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
			if(system_cpu_support_sse41()) {
				convert_to_byte_kernel = kernel_cpu_sse41_convert_to_byte;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
			if(system_cpu_support_sse3()) {
				convert_to_byte_kernel = kernel_cpu_sse3_convert_to_byte;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
			if(system_cpu_support_sse2()) {
				convert_to_byte_kernel = kernel_cpu_sse2_convert_to_byte;
				convert_to_byte_kernel = kernel_cpu_convert_to_byte;

			for(int y = task.y; y < task.y + task.h; y++)
				for(int x = task.x; x < task.x + task.w; x++)
					convert_to_byte_kernel(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
					                       sample_scale, x, y, task.offset, task.stride);
		}
	}
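
	/* Shader evaluation task (used for things such as baking and displacement):
	 * runs the shader kernel for each input element and sample, writing the
	 * results (and optionally luma) to the output buffers. */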
	void thread_shader(DeviceTask& task)
	{
		KernelGlobals kg = kernel_globals;

		OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);

		void(*shader_kernel)(KernelGlobals*, uint4*, float4*, float*, int, int, int, int, int);

#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
		if(system_cpu_support_avx2()) {
			shader_kernel = kernel_cpu_avx2_shader;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
		if(system_cpu_support_avx()) {
			shader_kernel = kernel_cpu_avx_shader;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
		if(system_cpu_support_sse41()) {
			shader_kernel = kernel_cpu_sse41_shader;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
		if(system_cpu_support_sse3()) {
			shader_kernel = kernel_cpu_sse3_shader;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
		if(system_cpu_support_sse2()) {
			shader_kernel = kernel_cpu_sse2_shader;
			shader_kernel = kernel_cpu_shader;

		for(int sample = 0; sample < task.num_samples; sample++) {
			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
				shader_kernel(&kg,
				              (uint4*)task.shader_input,
				              (float4*)task.shader_output,
				              (float*)task.shader_output_luma,
				              task.shader_eval_type,
				              task.shader_filter, x, task.offset, sample);

			if(task.get_cancel() || task_pool.canceled())
				break;

			task.update_progress(NULL);
		}

		OSLShader::thread_free(&kg);
	}
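
	/* Task splitting: tasks are divided into one subtask per worker thread,
	 * with shader tasks additionally split into smaller pieces. */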
	int get_split_task_count(DeviceTask& task)
	{
		if(task.type == DeviceTask::SHADER)
			return task.get_subtask_count(TaskScheduler::num_threads(), 256);
		else
			return task.get_subtask_count(TaskScheduler::num_threads());
	}

	void task_add(DeviceTask& task)
	{
		/* split task into smaller ones */
		list<DeviceTask> tasks;

		if(task.type == DeviceTask::SHADER)
			task.split(tasks, TaskScheduler::num_threads(), 256);
		else
			task.split(tasks, TaskScheduler::num_threads());

		foreach(DeviceTask& task, tasks)
			task_pool.push(new CPUDeviceTask(this, task));
	}

	void task_wait()
	{
		task_pool.wait_work();
	}

	inline KernelGlobals thread_kernel_globals_init()
	{
		KernelGlobals kg = kernel_globals;
		kg.transparent_shadow_intersections = NULL;
		const int decoupled_count = sizeof(kg.decoupled_volume_steps) /
		                            sizeof(*kg.decoupled_volume_steps);
		for(int i = 0; i < decoupled_count; ++i) {
			kg.decoupled_volume_steps[i] = NULL;
		}
		kg.decoupled_volume_steps_index = 0;

		OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);

		return kg;
	}

	inline void thread_kernel_globals_free(KernelGlobals *kg)
	{
		if(kg->transparent_shadow_intersections != NULL) {
			free(kg->transparent_shadow_intersections);
		}
		const int decoupled_count = sizeof(kg->decoupled_volume_steps) /
		                            sizeof(*kg->decoupled_volume_steps);
		for(int i = 0; i < decoupled_count; ++i) {
			if(kg->decoupled_volume_steps[i] != NULL) {
				free(kg->decoupled_volume_steps[i]);
			}
		}

		OSLShader::thread_free(kg);
	}
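
	/* CPU kernels are statically compiled into the binary, so loading them
	 * only records the requested features, which the split kernel path uses
	 * later. */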
	virtual bool load_kernels(DeviceRequestedFeatures& requested_features_) {
		requested_features = requested_features_;

		return true;
	}
};

class CPUSplitKernelFunction : public SplitKernelFunction {
public:
	CPUDevice* device;
	void (*func)(KernelGlobals *kg, KernelData *data);

	CPUSplitKernelFunction(CPUDevice* device) : device(device), func(NULL) {}
	~CPUSplitKernelFunction() {}

	virtual bool enqueue(const KernelDimensions& dim, device_memory& kernel_globals, device_memory& data)
	{
		KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
		kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);

		for(int y = 0; y < dim.global_size[1]; y++) {
			for(int x = 0; x < dim.global_size[0]; x++) {
				kg->global_id = make_int2(x, y);

				func(kg, (KernelData*)data.device_pointer);
			}
		}

		return true;
	}
};
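
/* CPUSplitKernel implementation: "enqueueing" work on the CPU means calling
 * the kernel function directly for every (x, y) element of the global work
 * size. */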
CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device)
{
}

bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
                                                    RenderTile& rtile,
                                                    int num_global_elements,
                                                    device_memory& kernel_globals,
                                                    device_memory& data,
                                                    device_memory& split_data,
                                                    device_memory& ray_state,
                                                    device_memory& queue_index,
                                                    device_memory& use_queues_flags,
                                                    device_memory& work_pool_wgs)
{
	typedef void(*data_init_t)(KernelGlobals *kg,
	                           ccl_constant KernelData *data,
	                           ccl_global void *split_data_buffer,
	                           int num_elements,
	                           ccl_global char *ray_state,
	                           ccl_global uint *rng_state,
	                           int start_sample,
	                           int end_sample,
	                           int sx, int sy, int sw, int sh, int offset, int stride,
	                           ccl_global int *Queue_index,
	                           int queuesize,
	                           ccl_global char *use_queues_flag,
	                           ccl_global unsigned int *work_pool_wgs,
	                           unsigned int num_samples,
	                           ccl_global float *buffer);

	data_init_t data_init;

#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
	if(system_cpu_support_avx2()) {
		data_init = kernel_cpu_avx2_data_init;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
	if(system_cpu_support_avx()) {
		data_init = kernel_cpu_avx_data_init;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
	if(system_cpu_support_sse41()) {
		data_init = kernel_cpu_sse41_data_init;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
	if(system_cpu_support_sse3()) {
		data_init = kernel_cpu_sse3_data_init;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
	if(system_cpu_support_sse2()) {
		data_init = kernel_cpu_sse2_data_init;
		data_init = kernel_cpu_data_init;

	KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
	kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);

	for(int y = 0; y < dim.global_size[1]; y++) {
		for(int x = 0; x < dim.global_size[0]; x++) {
			kg->global_id = make_int2(x, y);

			data_init((KernelGlobals*)kernel_globals.device_pointer,
			          (KernelData*)data.device_pointer,
			          (void*)split_data.device_pointer,
			          num_global_elements,
			          (char*)ray_state.device_pointer,
			          (uint*)rtile.rng_state,
			          rtile.start_sample,
			          rtile.start_sample + rtile.num_samples,
			          rtile.x, rtile.y, rtile.w, rtile.h, rtile.offset, rtile.stride,
			          (int*)queue_index.device_pointer,
			          dim.global_size[0] * dim.global_size[1],
			          (char*)use_queues_flags.device_pointer,
			          (uint*)work_pool_wgs.device_pointer,
			          rtile.num_samples,
			          (float*)rtile.buffer);
		}
	}

	return true;
}

SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&)
{
	CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);

	kernel->func = device->get_kernel_function<void(*)(KernelGlobals*, KernelData*)>(kernel_name);

	return kernel;
}

int2 CPUSplitKernel::split_kernel_local_size()
{
	return make_int2(1, 1);
}

int2 CPUSplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask * /*task*/) {
	return make_int2(1, 1);
}

uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device_memory& /*data*/, size_t num_threads) {
	KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;

	return split_data_buffer_size(kg, num_threads);
}
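
/* Static kernel function table, plus the enumeration and creation entry points
 * used by the device registry. */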

unordered_map<string, void*> CPUDevice::kernel_functions;

Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
{
	return new CPUDevice(info, stats, background);
}

void device_cpu_info(vector<DeviceInfo>& devices)
{
	DeviceInfo info;

	info.type = DEVICE_CPU;
	info.description = system_cpu_brand_string();
	info.advanced_shading = true;
	info.pack_images = false;

	devices.insert(devices.begin(), info);
}

string device_cpu_capabilities(void)
{
	string capabilities = "";
	capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
	capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
	capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
	capabilities += system_cpu_support_avx() ? "AVX " : "";
	capabilities += system_cpu_support_avx2() ? "AVX2" : "";
	if(!capabilities.empty() && capabilities[capabilities.size() - 1] == ' ')
		capabilities.resize(capabilities.size() - 1);
	return capabilities;
}