/*
 * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <stdlib.h>
#include <string.h>

/* So ImathMath is included before our kernel_cpu_compat. */
#ifdef WITH_OSL
/* So no context pollution happens from indirectly included windows.h. */
#  include "util_windows.h"
#  include <OSL/oslexec.h>
#endif

#include "device.h"
#include "device_intern.h"

#include "kernel.h"
#include "kernel_compat_cpu.h"
#include "kernel_types.h"
#include "kernel_globals.h"

#include "osl_shader.h"
#include "osl_globals.h"

#include "buffers.h"

#include "util_debug.h"
#include "util_foreach.h"
#include "util_function.h"
#include "util_logging.h"
#include "util_opengl.h"
#include "util_progress.h"
#include "util_system.h"
#include "util_thread.h"

CCL_NAMESPACE_BEGIN
class CPUDevice : public Device
{
public:
    TaskPool task_pool;
    KernelGlobals kernel_globals;

#ifdef WITH_OSL
    OSLGlobals osl_globals;
#endif

    CPUDevice(DeviceInfo& info, Stats &stats, bool background)
    : Device(info, stats, background)
    {
#ifdef WITH_OSL
        kernel_globals.osl = &osl_globals;
#endif
        /* Run CPU feature detection now, so the cached results are
         * initialized before any worker threads query them. */
        system_cpu_support_sse2();
        system_cpu_support_sse3();
        system_cpu_support_sse41();
        system_cpu_support_avx();
        system_cpu_support_avx2();

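        /* Log once which kernel flavor will be used. Each branch needs both
         * compile-time support (the WITH_CYCLES_OPTIMIZED_KERNEL_* define)
         * and runtime support on the actual CPU; the first match wins and
         * the chain falls through to the regular kernels otherwise. */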
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
        if(system_cpu_support_avx2()) {
            VLOG(1) << "Will be using AVX2 kernels.";
        }
        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
        if(system_cpu_support_avx()) {
            VLOG(1) << "Will be using AVX kernels.";
        }
        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
        if(system_cpu_support_sse41()) {
            VLOG(1) << "Will be using SSE4.1 kernels.";
        }
        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
        if(system_cpu_support_sse3()) {
            VLOG(1) << "Will be using SSE3 kernels.";
        }
        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
        if(system_cpu_support_sse2()) {
            VLOG(1) << "Will be using SSE2 kernels.";
        }
        else
#endif
        {
            VLOG(1) << "Will be using regular kernels.";
        }
    }

    ~CPUDevice()
    {
        task_pool.stop();
    }

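    /* On the CPU device, "device" memory is simply the host memory: allocation
     * aliases the existing host pointer and the copy functions are no-ops. */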
    void mem_alloc(device_memory& mem, MemoryType /*type*/)
    {
        mem.device_pointer = mem.data_pointer;
        mem.device_size = mem.memory_size();
        stats.mem_alloc(mem.device_size);
    }

    void mem_copy_to(device_memory& /*mem*/)
    {
        /* no-op */
    }

    void mem_copy_from(device_memory& /*mem*/,
                       int /*y*/, int /*w*/, int /*h*/,
                       int /*elem*/)
    {
        /* no-op */
    }

    void mem_zero(device_memory& mem)
    {
        memset((void*)mem.device_pointer, 0, mem.memory_size());
    }

    void mem_free(device_memory& mem)
    {
        if(mem.device_pointer) {
            mem.device_pointer = 0;
            stats.mem_free(mem.device_size);
        }
    }

    void const_copy_to(const char *name, void *host, size_t size)
    {
        kernel_const_copy(&kernel_globals, name, host, size);
    }

    void tex_alloc(const char *name,
                   device_memory& mem,
                   InterpolationType interpolation,
                   ExtensionType extension)
    {
        VLOG(1) << "Texture allocate: " << name << ", "
                << mem.memory_size() << " bytes.";
        kernel_tex_copy(&kernel_globals,
                        name,
                        mem.data_pointer,
                        mem.data_width,
                        mem.data_height,
                        mem.data_depth,
                        interpolation,
                        extension);
        mem.device_pointer = mem.data_pointer;
        mem.device_size = mem.memory_size();
        stats.mem_alloc(mem.device_size);
    }

    void tex_free(device_memory& mem)
    {
        if(mem.device_pointer) {
            mem.device_pointer = 0;
            stats.mem_free(mem.device_size);
        }
    }

    void thread_run(DeviceTask *task)
    {
        if(task->type == DeviceTask::PATH_TRACE)
            thread_path_trace(*task);
        else if(task->type == DeviceTask::FILM_CONVERT)
            thread_film_convert(*task);
        else if(task->type == DeviceTask::SHADER)
            thread_shader(*task);
    }

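    /* Wrapper that makes a DeviceTask runnable on the task pool, by binding
     * its run function to CPUDevice::thread_run on this device. */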
    class CPUDeviceTask : public DeviceTask {
    public:
        CPUDeviceTask(CPUDevice *device, DeviceTask& task)
        : DeviceTask(task)
        {
            run = function_bind(&CPUDevice::thread_run, device, this);
        }
    };

    void thread_path_trace(DeviceTask& task)
    {
        if(task_pool.canceled()) {
            if(task.need_finish_queue == false)
                return;
        }

        KernelGlobals kg = thread_kernel_globals_init();
        RenderTile tile;

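        /* Pick the fastest path tracing kernel that was compiled in and is
         * supported by this CPU, using the same compile-time/runtime cascade
         * as the logging in the constructor. */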
        void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int);

#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
        if(system_cpu_support_avx2()) {
            path_trace_kernel = kernel_cpu_avx2_path_trace;
        }
        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
        if(system_cpu_support_avx()) {
            path_trace_kernel = kernel_cpu_avx_path_trace;
        }
        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
        if(system_cpu_support_sse41()) {
            path_trace_kernel = kernel_cpu_sse41_path_trace;
        }
        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
        if(system_cpu_support_sse3()) {
            path_trace_kernel = kernel_cpu_sse3_path_trace;
        }
        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
        if(system_cpu_support_sse2()) {
            path_trace_kernel = kernel_cpu_sse2_path_trace;
        }
        else
#endif
        {
            path_trace_kernel = kernel_cpu_path_trace;
        }

        while(task.acquire_tile(this, tile)) {
            float *render_buffer = (float*)tile.buffer;
            uint *rng_state = (uint*)tile.rng_state;
            int start_sample = tile.start_sample;
            int end_sample = tile.start_sample + tile.num_samples;

            for(int sample = start_sample; sample < end_sample; sample++) {
                if(task.get_cancel() || task_pool.canceled()) {
                    if(task.need_finish_queue == false)
                        break;
                }

                for(int y = tile.y; y < tile.y + tile.h; y++) {
                    for(int x = tile.x; x < tile.x + tile.w; x++) {
                        path_trace_kernel(&kg, render_buffer, rng_state,
                                          sample, x, y, tile.offset, tile.stride);
                    }
                }

                tile.sample = sample + 1;

                task.update_progress(&tile);
            }

            task.release_tile(tile);

            if(task_pool.canceled()) {
                if(task.need_finish_queue == false)
                    break;
            }
        }

        thread_kernel_globals_free(&kg);
    }

    void thread_film_convert(DeviceTask& task)
    {
        float sample_scale = 1.0f/(task.sample + 1);

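        /* The render buffer accumulates sample sums, so each pixel is scaled
         * by 1/(sample + 1) when converting for display. Half-float and byte
         * output each pick the best supported kernel, same cascade as above. */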
        if(task.rgba_half) {
            void(*convert_to_half_float_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
            if(system_cpu_support_avx2()) {
                convert_to_half_float_kernel = kernel_cpu_avx2_convert_to_half_float;
            }
            else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
            if(system_cpu_support_avx()) {
                convert_to_half_float_kernel = kernel_cpu_avx_convert_to_half_float;
            }
            else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
            if(system_cpu_support_sse41()) {
                convert_to_half_float_kernel = kernel_cpu_sse41_convert_to_half_float;
            }
            else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
            if(system_cpu_support_sse3()) {
                convert_to_half_float_kernel = kernel_cpu_sse3_convert_to_half_float;
            }
            else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
            if(system_cpu_support_sse2()) {
                convert_to_half_float_kernel = kernel_cpu_sse2_convert_to_half_float;
            }
            else
#endif
            {
                convert_to_half_float_kernel = kernel_cpu_convert_to_half_float;
            }

            for(int y = task.y; y < task.y + task.h; y++)
                for(int x = task.x; x < task.x + task.w; x++)
                    convert_to_half_float_kernel(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
                                                 sample_scale, x, y, task.offset, task.stride);
        }
        else {
            void(*convert_to_byte_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
            if(system_cpu_support_avx2()) {
                convert_to_byte_kernel = kernel_cpu_avx2_convert_to_byte;
            }
            else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
            if(system_cpu_support_avx()) {
                convert_to_byte_kernel = kernel_cpu_avx_convert_to_byte;
            }
            else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
            if(system_cpu_support_sse41()) {
                convert_to_byte_kernel = kernel_cpu_sse41_convert_to_byte;
            }
            else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
            if(system_cpu_support_sse3()) {
                convert_to_byte_kernel = kernel_cpu_sse3_convert_to_byte;
            }
            else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
            if(system_cpu_support_sse2()) {
                convert_to_byte_kernel = kernel_cpu_sse2_convert_to_byte;
            }
            else
#endif
            {
                convert_to_byte_kernel = kernel_cpu_convert_to_byte;
            }

            for(int y = task.y; y < task.y + task.h; y++)
                for(int x = task.x; x < task.x + task.w; x++)
                    convert_to_byte_kernel(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
                                           sample_scale, x, y, task.offset, task.stride);
        }
    }

    void thread_shader(DeviceTask& task)
    {
        KernelGlobals kg = kernel_globals;

#ifdef WITH_OSL
        OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif

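        /* Select the shader evaluation kernel, same compile-time/runtime
         * cascade as for path tracing. */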
        void(*shader_kernel)(KernelGlobals*, uint4*, float4*, float*, int, int, int, int, int);

#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
        if(system_cpu_support_avx2()) {
            shader_kernel = kernel_cpu_avx2_shader;
        }
        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
        if(system_cpu_support_avx()) {
            shader_kernel = kernel_cpu_avx_shader;
        }
        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
        if(system_cpu_support_sse41()) {
            shader_kernel = kernel_cpu_sse41_shader;
        }
        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
        if(system_cpu_support_sse3()) {
            shader_kernel = kernel_cpu_sse3_shader;
        }
        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
        if(system_cpu_support_sse2()) {
            shader_kernel = kernel_cpu_sse2_shader;
        }
        else
#endif
        {
            shader_kernel = kernel_cpu_shader;
        }

        for(int sample = 0; sample < task.num_samples; sample++) {
            for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
                shader_kernel(&kg,
                              (uint4*)task.shader_input,
                              (float4*)task.shader_output,
                              (float*)task.shader_output_luma,
                              task.shader_eval_type,
                              task.shader_filter,
                              x,
                              task.offset,
                              sample);

            if(task.get_cancel() || task_pool.canceled())
                break;

            task.update_progress(NULL);
        }

#ifdef WITH_OSL
        OSLShader::thread_free(&kg);
#endif
    }

    int get_split_task_count(DeviceTask& task)
    {
        if(task.type == DeviceTask::SHADER)
            return task.get_subtask_count(TaskScheduler::num_threads(), 256);
        else
            return task.get_subtask_count(TaskScheduler::num_threads());
    }

    void task_add(DeviceTask& task)
    {
        /* Split the task into smaller ones, one per worker thread; shader
         * tasks are further limited to at most 256 points each, so progress
         * updates and cancellation stay responsive. */
        list<DeviceTask> tasks;

        if(task.type == DeviceTask::SHADER)
            task.split(tasks, TaskScheduler::num_threads(), 256);
        else
            task.split(tasks, TaskScheduler::num_threads());

        foreach(DeviceTask& task, tasks)
            task_pool.push(new CPUDeviceTask(this, task));
    }

    void task_wait()
    {
        task_pool.wait_work();
    }

    void task_cancel()
    {
        task_pool.cancel();
    }

protected:
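    /* Each render thread works on its own copy of KernelGlobals, with the
     * lazily allocated per-thread scratch buffers cleared so that every
     * thread allocates, and later frees, its own. */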
    inline KernelGlobals thread_kernel_globals_init()
    {
        KernelGlobals kg = kernel_globals;
        kg.transparent_shadow_intersections = NULL;
        const int decoupled_count = sizeof(kg.decoupled_volume_steps) /
                                    sizeof(*kg.decoupled_volume_steps);
        for(int i = 0; i < decoupled_count; ++i) {
            kg.decoupled_volume_steps[i] = NULL;
        }
        kg.decoupled_volume_steps_index = 0;
#ifdef WITH_OSL
        OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif
        return kg;
    }

    inline void thread_kernel_globals_free(KernelGlobals *kg)
    {
        if(kg->transparent_shadow_intersections != NULL) {
            free(kg->transparent_shadow_intersections);
        }
        const int decoupled_count = sizeof(kg->decoupled_volume_steps) /
                                    sizeof(*kg->decoupled_volume_steps);
        for(int i = 0; i < decoupled_count; ++i) {
            if(kg->decoupled_volume_steps[i] != NULL) {
                free(kg->decoupled_volume_steps[i]);
            }
        }
#ifdef WITH_OSL
        OSLShader::thread_free(kg);
#endif
    }
};

Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
{
    return new CPUDevice(info, stats, background);
}

void device_cpu_info(vector<DeviceInfo>& devices)
{
    DeviceInfo info;

    info.type = DEVICE_CPU;
    info.description = system_cpu_brand_string();
    info.id = "CPU";
    info.num = 0;
    info.advanced_shading = true;
    info.pack_images = false;

    devices.insert(devices.begin(), info);
}

string device_cpu_capabilities(void)
{
    string capabilities = "";
    capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
    capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
    capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
    capabilities += system_cpu_support_avx() ? "AVX " : "";
    capabilities += system_cpu_support_avx2() ? "AVX2" : "";
    /* Strip the trailing space, guarding against an empty string on CPUs
     * that support none of the above. */
    if(!capabilities.empty() && capabilities[capabilities.size() - 1] == ' ')
        capabilities.resize(capabilities.size() - 1);
    return capabilities;
}

CCL_NAMESPACE_END