/*
 * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* So ImathMath is included before our kernel_cpu_compat. */
#ifdef WITH_OSL
/* So no context pollution happens from indirectly included windows.h */
#  include "util/util_windows.h"
#  include <OSL/oslexec.h>
#endif

#include "device/device.h"
#include "device/device_intern.h"
#include "device/device_split_kernel.h"

#include "kernel/kernel.h"
#include "kernel/kernel_compat_cpu.h"
#include "kernel/kernel_types.h"
#include "kernel/split/kernel_split_data.h"
#include "kernel/kernel_globals.h"

#include "kernel/osl/osl_shader.h"
#include "kernel/osl/osl_globals.h"

#include "render/buffers.h"

#include "util/util_debug.h"
#include "util/util_foreach.h"
#include "util/util_function.h"
#include "util/util_logging.h"
#include "util/util_map.h"
#include "util/util_opengl.h"
#include "util/util_progress.h"
#include "util/util_system.h"
#include "util/util_thread.h"

class CPUDevice;

class CPUSplitKernel : public DeviceSplitKernel {
	CPUDevice *device;
public:
	explicit CPUSplitKernel(CPUDevice *device);

	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
	                                            RenderTile& rtile,
	                                            int num_global_elements,
	                                            device_memory& kernel_globals,
	                                            device_memory& kernel_data_,
	                                            device_memory& split_data,
	                                            device_memory& ray_state,
	                                            device_memory& queue_index,
	                                            device_memory& use_queues_flag,
	                                            device_memory& work_pool_wgs);

	virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
	virtual int2 split_kernel_local_size();
	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
	virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
};
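
/* CPU device: runs all kernels directly on the host CPU, dispatching to the
 * widest instruction set (SSE2/SSE3/SSE4.1/AVX/AVX2) available at runtime. */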
class CPUDevice : public Device
{
	static unordered_map<string, void*> kernel_functions;

	static void register_kernel_function(const char* name, void* func)
	{
		kernel_functions[name] = func;
	}

	static const char* get_arch_name()
	{
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
		if(system_cpu_support_avx2()) {
			return "cpu_avx2";
		}
		else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
		if(system_cpu_support_avx()) {
			return "cpu_avx";
		}
		else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
		if(system_cpu_support_sse41()) {
			return "cpu_sse41";
		}
		else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
		if(system_cpu_support_sse3()) {
			return "cpu_sse3";
		}
		else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
		if(system_cpu_support_sse2()) {
			return "cpu_sse2";
		}
		else
#endif
		{
			return "cpu";
		}
	}
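
	/* Look up a registered kernel entry point by name, prefixed with the
	 * architecture string returned by get_arch_name(). */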
	template<typename F>
	static F get_kernel_function(string name)
	{
		name = string("kernel_") + get_arch_name() + "_" + name;

		unordered_map<string, void*>::iterator it = kernel_functions.find(name);

		if(it == kernel_functions.end()) {
			assert(!"kernel function not found");
			return NULL;
		}

		return (F)it->second;
	}

	friend class CPUSplitKernel;

public:
	TaskPool task_pool;
	KernelGlobals kernel_globals;

#ifdef WITH_OSL
	OSLGlobals osl_globals;
#endif

	bool use_split_kernel;

	DeviceRequestedFeatures requested_features;

	CPUDevice(DeviceInfo& info, Stats &stats, bool background)
	: Device(info, stats, background)
	{
#ifdef WITH_OSL
		kernel_globals.osl = &osl_globals;
#endif

		/* do now to avoid thread issues */
		system_cpu_support_sse2();
		system_cpu_support_sse3();
		system_cpu_support_sse41();
		system_cpu_support_avx();
		system_cpu_support_avx2();

#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
		if(system_cpu_support_avx2()) {
			VLOG(1) << "Will be using AVX2 kernels.";
		}
		else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
		if(system_cpu_support_avx()) {
			VLOG(1) << "Will be using AVX kernels.";
		}
		else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
		if(system_cpu_support_sse41()) {
			VLOG(1) << "Will be using SSE4.1 kernels.";
		}
		else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
		if(system_cpu_support_sse3()) {
			VLOG(1) << "Will be using SSE3 kernels.";
		}
		else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
		if(system_cpu_support_sse2()) {
			VLOG(1) << "Will be using SSE2 kernels.";
		}
		else
#endif
		{
			VLOG(1) << "Will be using regular kernels.";
		}

		use_split_kernel = DebugFlags().cpu.split_kernel;
		if(use_split_kernel) {
			VLOG(1) << "Will be using split kernel.";
		}

		kernel_cpu_register_functions(register_kernel_function);
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
		kernel_cpu_sse2_register_functions(register_kernel_function);
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
		kernel_cpu_sse3_register_functions(register_kernel_function);
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
		kernel_cpu_sse41_register_functions(register_kernel_function);
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
		kernel_cpu_avx_register_functions(register_kernel_function);
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
		kernel_cpu_avx2_register_functions(register_kernel_function);
#endif
	}

	virtual bool show_samples() const
	{
		return (TaskScheduler::num_threads() == 1);
	}
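
	/* Device memory is plain host memory on the CPU device: reuse the host
	 * pointer when one exists, otherwise allocate with malloc(). */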
	void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/)
	{
		VLOG(1) << "Buffer allocate: " << name << ", "
		        << string_human_readable_number(mem.memory_size()) << " bytes. ("
		        << string_human_readable_size(mem.memory_size()) << ")";

		mem.device_pointer = mem.data_pointer;

		if(!mem.device_pointer) {
			mem.device_pointer = (device_ptr)malloc(mem.memory_size());
		}

		mem.device_size = mem.memory_size();
		stats.mem_alloc(mem.device_size);
	}

	void mem_copy_to(device_memory& /*mem*/)
	{
	}

	void mem_copy_from(device_memory& /*mem*/,
	                   int /*y*/, int /*w*/, int /*h*/,
	                   int /*elem*/)
	{
	}

	void mem_zero(device_memory& mem)
	{
		memset((void*)mem.device_pointer, 0, mem.memory_size());
	}

	void mem_free(device_memory& mem)
	{
		if(mem.device_pointer) {
			if(!mem.data_pointer) {
				free((void*)mem.device_pointer);
			}

			mem.device_pointer = 0;
			stats.mem_free(mem.device_size);
		}
	}

	void const_copy_to(const char *name, void *host, size_t size)
	{
		kernel_const_copy(&kernel_globals, name, host, size);
	}

	void tex_alloc(const char *name,
	               device_memory& mem,
	               InterpolationType interpolation,
	               ExtensionType extension)
	{
		VLOG(1) << "Texture allocate: " << name << ", "
		        << string_human_readable_number(mem.memory_size()) << " bytes. ("
		        << string_human_readable_size(mem.memory_size()) << ")";
		kernel_tex_copy(&kernel_globals,
		                name,
		                mem.data_pointer,
		                mem.data_width,
		                mem.data_height,
		                mem.data_depth,
		                interpolation,
		                extension);
		mem.device_pointer = mem.data_pointer;
		mem.device_size = mem.memory_size();
		stats.mem_alloc(mem.device_size);
	}

	void tex_free(device_memory& mem)
	{
		if(mem.device_pointer) {
			mem.device_pointer = 0;
			stats.mem_free(mem.device_size);
		}
	}
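
	/* Entry point for pool worker threads: dispatch the task to the handler
	 * for its type. */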
	void thread_run(DeviceTask *task)
	{
		if(task->type == DeviceTask::PATH_TRACE) {
			if(!use_split_kernel) {
				thread_path_trace(*task);
			}
			else {
				thread_path_trace_split(*task);
			}
		}
		else if(task->type == DeviceTask::FILM_CONVERT)
			thread_film_convert(*task);
		else if(task->type == DeviceTask::SHADER)
			thread_shader(*task);
	}

	class CPUDeviceTask : public DeviceTask {
	public:
		CPUDeviceTask(CPUDevice *device, DeviceTask& task)
		: DeviceTask(task)
		{
			run = function_bind(&CPUDevice::thread_run, device, this);
		}
	};
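
	/* Render tiles with the megakernel path tracer: one sample at a time over
	 * all pixels of the tile, checking for cancellation between samples. */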
	void thread_path_trace(DeviceTask& task)
	{
		if(task_pool.canceled()) {
			if(task.need_finish_queue == false)
				return;
		}

		KernelGlobals kg = thread_kernel_globals_init();
		RenderTile tile;

		void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int);

#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
		if(system_cpu_support_avx2()) {
			path_trace_kernel = kernel_cpu_avx2_path_trace;
		}
		else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
		if(system_cpu_support_avx()) {
			path_trace_kernel = kernel_cpu_avx_path_trace;
		}
		else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
		if(system_cpu_support_sse41()) {
			path_trace_kernel = kernel_cpu_sse41_path_trace;
		}
		else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
		if(system_cpu_support_sse3()) {
			path_trace_kernel = kernel_cpu_sse3_path_trace;
		}
		else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
		if(system_cpu_support_sse2()) {
			path_trace_kernel = kernel_cpu_sse2_path_trace;
		}
		else
#endif
		{
			path_trace_kernel = kernel_cpu_path_trace;
		}

		while(task.acquire_tile(this, tile)) {
			float *render_buffer = (float*)tile.buffer;
			uint *rng_state = (uint*)tile.rng_state;
			int start_sample = tile.start_sample;
			int end_sample = tile.start_sample + tile.num_samples;

			for(int sample = start_sample; sample < end_sample; sample++) {
				if(task.get_cancel() || task_pool.canceled()) {
					if(task.need_finish_queue == false)
						break;
				}

				for(int y = tile.y; y < tile.y + tile.h; y++) {
					for(int x = tile.x; x < tile.x + tile.w; x++) {
						path_trace_kernel(&kg, render_buffer, rng_state,
						                  sample, x, y, tile.offset, tile.stride);
					}
				}

				tile.sample = sample + 1;

				task.update_progress(&tile, tile.w*tile.h);
			}

			task.release_tile(tile);

			if(task_pool.canceled()) {
				if(task.need_finish_queue == false)
					break;
			}
		}

		thread_kernel_globals_free(&kg);
	}
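
	/* Render tiles with the split kernel. A per-thread KernelGlobals is
	 * placement-constructed inside a device_memory buffer and handed to
	 * DeviceSplitKernel. */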
	void thread_path_trace_split(DeviceTask& task)
	{
		if(task_pool.canceled()) {
			if(task.need_finish_queue == false)
				return;
		}

		RenderTile tile;

		CPUSplitKernel split_kernel(this);

		/* allocate buffer for kernel globals */
		device_memory kgbuffer;
		kgbuffer.resize(sizeof(KernelGlobals));
		mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE);

		KernelGlobals *kg = new ((void*) kgbuffer.device_pointer) KernelGlobals(thread_kernel_globals_init());

		requested_features.max_closure = MAX_CLOSURE;
		if(!split_kernel.load_kernels(requested_features)) {
			thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
			mem_free(kgbuffer);
			return;
		}

		while(task.acquire_tile(this, tile)) {
			device_memory data;
			split_kernel.path_trace(&task, tile, kgbuffer, data);

			task.release_tile(tile);

			if(task_pool.canceled()) {
				if(task.need_finish_queue == false)
					break;
			}
		}

		thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
		mem_free(kgbuffer);
	}
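
	/* Convert the render buffer to displayable half-float or byte pixels. */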
	void thread_film_convert(DeviceTask& task)
	{
		float sample_scale = 1.0f/(task.sample + 1);

		if(task.rgba_half) {
			void(*convert_to_half_float_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
			if(system_cpu_support_avx2()) {
				convert_to_half_float_kernel = kernel_cpu_avx2_convert_to_half_float;
			}
			else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
			if(system_cpu_support_avx()) {
				convert_to_half_float_kernel = kernel_cpu_avx_convert_to_half_float;
			}
			else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
			if(system_cpu_support_sse41()) {
				convert_to_half_float_kernel = kernel_cpu_sse41_convert_to_half_float;
			}
			else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
			if(system_cpu_support_sse3()) {
				convert_to_half_float_kernel = kernel_cpu_sse3_convert_to_half_float;
			}
			else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
			if(system_cpu_support_sse2()) {
				convert_to_half_float_kernel = kernel_cpu_sse2_convert_to_half_float;
			}
			else
#endif
			{
				convert_to_half_float_kernel = kernel_cpu_convert_to_half_float;
			}

			for(int y = task.y; y < task.y + task.h; y++)
				for(int x = task.x; x < task.x + task.w; x++)
					convert_to_half_float_kernel(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
					                             sample_scale, x, y, task.offset, task.stride);
		}
		else {
			void(*convert_to_byte_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
			if(system_cpu_support_avx2()) {
				convert_to_byte_kernel = kernel_cpu_avx2_convert_to_byte;
			}
			else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
			if(system_cpu_support_avx()) {
				convert_to_byte_kernel = kernel_cpu_avx_convert_to_byte;
			}
			else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
			if(system_cpu_support_sse41()) {
				convert_to_byte_kernel = kernel_cpu_sse41_convert_to_byte;
			}
			else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
			if(system_cpu_support_sse3()) {
				convert_to_byte_kernel = kernel_cpu_sse3_convert_to_byte;
			}
			else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
			if(system_cpu_support_sse2()) {
				convert_to_byte_kernel = kernel_cpu_sse2_convert_to_byte;
			}
			else
#endif
			{
				convert_to_byte_kernel = kernel_cpu_convert_to_byte;
			}

			for(int y = task.y; y < task.y + task.h; y++)
				for(int x = task.x; x < task.x + task.w; x++)
					convert_to_byte_kernel(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
					                       sample_scale, x, y, task.offset, task.stride);
		}
	}
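
	/* Evaluate shaders for a SHADER task over the requested range of input
	 * points, one sample at a time. */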
	void thread_shader(DeviceTask& task)
	{
		KernelGlobals kg = kernel_globals;

#ifdef WITH_OSL
		OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif

		void(*shader_kernel)(KernelGlobals*, uint4*, float4*, float*, int, int, int, int, int);

#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
		if(system_cpu_support_avx2()) {
			shader_kernel = kernel_cpu_avx2_shader;
		}
		else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
		if(system_cpu_support_avx()) {
			shader_kernel = kernel_cpu_avx_shader;
		}
		else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
		if(system_cpu_support_sse41()) {
			shader_kernel = kernel_cpu_sse41_shader;
		}
		else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
		if(system_cpu_support_sse3()) {
			shader_kernel = kernel_cpu_sse3_shader;
		}
		else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
		if(system_cpu_support_sse2()) {
			shader_kernel = kernel_cpu_sse2_shader;
		}
		else
#endif
		{
			shader_kernel = kernel_cpu_shader;
		}

		for(int sample = 0; sample < task.num_samples; sample++) {
			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
				shader_kernel(&kg,
				              (uint4*)task.shader_input,
				              (float4*)task.shader_output,
				              (float*)task.shader_output_luma,
				              task.shader_eval_type,
				              task.shader_filter,
				              x,
				              task.offset,
				              sample);

			if(task.get_cancel() || task_pool.canceled())
				break;

			task.update_progress(NULL);
		}

#ifdef WITH_OSL
		OSLShader::thread_free(&kg);
#endif
	}

	int get_split_task_count(DeviceTask& task)
	{
		if(task.type == DeviceTask::SHADER)
			return task.get_subtask_count(TaskScheduler::num_threads(), 256);
		else
			return task.get_subtask_count(TaskScheduler::num_threads());
	}
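
	/* Split the task into per-thread subtasks and queue them on the task pool. */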
	void task_add(DeviceTask& task)
	{
		/* split task into smaller ones */
		list<DeviceTask> tasks;

		if(task.type == DeviceTask::SHADER)
			task.split(tasks, TaskScheduler::num_threads(), 256);
		else
			task.split(tasks, TaskScheduler::num_threads());

		foreach(DeviceTask& task, tasks)
			task_pool.push(new CPUDeviceTask(this, task));
	}

	void task_wait()
	{
		task_pool.wait_work();
	}
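
	/* Create a per-thread copy of KernelGlobals with its thread-local buffers
	 * cleared. */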
	inline KernelGlobals thread_kernel_globals_init()
	{
		KernelGlobals kg = kernel_globals;
		kg.transparent_shadow_intersections = NULL;
		const int decoupled_count = sizeof(kg.decoupled_volume_steps) /
		                            sizeof(*kg.decoupled_volume_steps);
		for(int i = 0; i < decoupled_count; ++i) {
			kg.decoupled_volume_steps[i] = NULL;
		}
		kg.decoupled_volume_steps_index = 0;
#ifdef WITH_OSL
		OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif
		return kg;
	}
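
	/* Free the thread-local buffers allocated while rendering with a
	 * per-thread KernelGlobals copy. */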
	inline void thread_kernel_globals_free(KernelGlobals *kg)
	{
		if(kg == NULL) {
			return;
		}

		if(kg->transparent_shadow_intersections != NULL) {
			free(kg->transparent_shadow_intersections);
		}
		const int decoupled_count = sizeof(kg->decoupled_volume_steps) /
		                            sizeof(*kg->decoupled_volume_steps);
		for(int i = 0; i < decoupled_count; ++i) {
			if(kg->decoupled_volume_steps[i] != NULL) {
				free(kg->decoupled_volume_steps[i]);
			}
		}

#ifdef WITH_OSL
		OSLShader::thread_free(kg);
#endif
	}

	virtual bool load_kernels(DeviceRequestedFeatures& requested_features_) {
		requested_features = requested_features_;

		return true;
	}
};
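
/* Wraps a single split-kernel stage; enqueue() runs it serially over the
 * whole global work size. */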
class CPUSplitKernelFunction : public SplitKernelFunction {
public:
	CPUDevice* device;
	void (*func)(KernelGlobals *kg, KernelData *data);

	CPUSplitKernelFunction(CPUDevice* device) : device(device), func(NULL) {}
	~CPUSplitKernelFunction() {}

	virtual bool enqueue(const KernelDimensions& dim, device_memory& kernel_globals, device_memory& data)
	{
		if(!func) {
			return false;
		}

		KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
		kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);

		for(int y = 0; y < dim.global_size[1]; y++) {
			for(int x = 0; x < dim.global_size[0]; x++) {
				kg->global_id = make_int2(x, y);

				func(kg, (KernelData*)data.device_pointer);
			}
		}

		return true;
	}
};

CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device)
{
}
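
/* Run the data_init kernel for every element of the global work size to set
 * up split-kernel state (ray state, queues, work pools) for a tile. */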
bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
                                                    RenderTile& rtile,
                                                    int num_global_elements,
                                                    device_memory& kernel_globals,
                                                    device_memory& data,
                                                    device_memory& split_data,
                                                    device_memory& ray_state,
                                                    device_memory& queue_index,
                                                    device_memory& use_queues_flags,
                                                    device_memory& work_pool_wgs)
{
	typedef void(*data_init_t)(KernelGlobals *kg,
	                           ccl_constant KernelData *data,
	                           ccl_global void *split_data_buffer,
	                           int num_elements,
	                           ccl_global char *ray_state,
	                           ccl_global uint *rng_state,
	                           int start_sample,
	                           int end_sample,
	                           int sx, int sy, int sw, int sh, int offset, int stride,
	                           ccl_global int *Queue_index,
	                           int queuesize,
	                           ccl_global char *use_queues_flag,
	                           ccl_global unsigned int *work_pool_wgs,
	                           unsigned int num_samples,
	                           ccl_global float *buffer);

	data_init_t data_init;

#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
	if(system_cpu_support_avx2()) {
		data_init = kernel_cpu_avx2_data_init;
	}
	else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
	if(system_cpu_support_avx()) {
		data_init = kernel_cpu_avx_data_init;
	}
	else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
	if(system_cpu_support_sse41()) {
		data_init = kernel_cpu_sse41_data_init;
	}
	else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
	if(system_cpu_support_sse3()) {
		data_init = kernel_cpu_sse3_data_init;
	}
	else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
	if(system_cpu_support_sse2()) {
		data_init = kernel_cpu_sse2_data_init;
	}
	else
#endif
	{
		data_init = kernel_cpu_data_init;
	}

	KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
	kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);

	for(int y = 0; y < dim.global_size[1]; y++) {
		for(int x = 0; x < dim.global_size[0]; x++) {
			kg->global_id = make_int2(x, y);

			data_init((KernelGlobals*)kernel_globals.device_pointer,
			          (KernelData*)data.device_pointer,
			          (void*)split_data.device_pointer,
			          num_global_elements,
			          (char*)ray_state.device_pointer,
			          (uint*)rtile.rng_state,
			          rtile.start_sample,
			          rtile.start_sample + rtile.num_samples,
			          rtile.x,
			          rtile.y,
			          rtile.w,
			          rtile.h,
			          rtile.offset,
			          rtile.stride,
			          (int*)queue_index.device_pointer,
			          dim.global_size[0] * dim.global_size[1],
			          (char*)use_queues_flags.device_pointer,
			          (uint*)work_pool_wgs.device_pointer,
			          rtile.num_samples,
			          (float*)rtile.buffer);
		}
	}

	return true;
}

SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&)
{
	CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);

	kernel->func = device->get_kernel_function<void(*)(KernelGlobals*, KernelData*)>(kernel_name);
	if(!kernel->func) {
		delete kernel;
		return NULL;
	}

	return kernel;
}

int2 CPUSplitKernel::split_kernel_local_size()
{
	return make_int2(1, 1);
}

int2 CPUSplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask * /*task*/) {
	return make_int2(1, 1);
}

uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device_memory& /*data*/, size_t num_threads) {
	KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;

	return split_data_buffer_size(kg, num_threads);
}

unordered_map<string, void*> CPUDevice::kernel_functions;

Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
{
	return new CPUDevice(info, stats, background);
}
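
/* Describe the CPU device and insert it at the front of the device list. */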
void device_cpu_info(vector<DeviceInfo>& devices)
{
	DeviceInfo info;

	info.type = DEVICE_CPU;
	info.description = system_cpu_brand_string();
	info.id = "CPU";
	info.num = 0;
	info.advanced_shading = true;
	info.pack_images = false;

	devices.insert(devices.begin(), info);
}
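
/* Report the optimized instruction sets supported by this CPU as a
 * space-separated string. */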
string device_cpu_capabilities(void)
{
	string capabilities = "";
	capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
	capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
	capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
	capabilities += system_cpu_support_avx() ? "AVX " : "";
	capabilities += system_cpu_support_avx2() ? "AVX2" : "";
	/* Trim the trailing space, guarding against an empty string when no
	 * optimized kernels are supported. */
	if(!capabilities.empty() && capabilities[capabilities.size() - 1] == ' ')
		capabilities.resize(capabilities.size() - 1);
	return capabilities;
}