/*
 * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/* So ImathMath is included before our kernel_cpu_compat. */
/* So no context pollution happens from indirectly included windows.h */
# include "util_windows.h"
# include <OSL/oslexec.h>
#include "device_intern.h"
#include "kernel_compat_cpu.h"
#include "kernel_types.h"
#include "kernel_globals.h"
#include "osl_shader.h"
#include "osl_globals.h"
#include "util_debug.h"
#include "util_foreach.h"
#include "util_function.h"
#include "util_logging.h"
#include "util_opengl.h"
#include "util_progress.h"
#include "util_system.h"
#include "util_thread.h"
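/* CPU rendering device.
 *
 * Runs the Cycles kernels directly on the host CPU. At runtime the device
 * picks the most capable kernel variant that was compiled in (SSE2, SSE3,
 * SSE4.1, AVX or AVX2) and falls back to the regular C++ kernels otherwise. */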
class CPUDevice : public Device
KernelGlobals kernel_globals;
OSLGlobals osl_globals;
CPUDevice(DeviceInfo& info, Stats &stats, bool background)
: Device(info, stats, background)
kernel_globals.osl = &osl_globals;
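/* The assignment above points the kernel globals at this device's OSLGlobals
 * so the shading kernels can reach the OSL shading system; it is only
 * relevant in builds with OSL support. */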
/* Query CPU feature support now, on the main thread, to avoid thread issues later. */
system_cpu_support_sse2();
system_cpu_support_sse3();
system_cpu_support_sse41();
system_cpu_support_avx();
system_cpu_support_avx2();
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
if(system_cpu_support_avx2()) {
VLOG(1) << "Will be using AVX2 kernels.";
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
if(system_cpu_support_avx()) {
VLOG(1) << "Will be using AVX kernels.";
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if(system_cpu_support_sse41()) {
VLOG(1) << "Will be using SSE4.1 kernels.";
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
if(system_cpu_support_sse3()) {
VLOG(1) << "Will be using SSE3 kernels.";
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
if(system_cpu_support_sse2()) {
VLOG(1) << "Will be using SSE2 kernels.";
VLOG(1) << "Will be using regular kernels.";
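/* The #ifdef/runtime-check chain above only logs which kernel variant will
 * be used; the actual selection is repeated per task in thread_path_trace(),
 * thread_film_convert() and thread_shader() below. */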
void mem_alloc(device_memory& mem, MemoryType /*type*/)
mem.device_pointer = mem.data_pointer;
mem.device_size = mem.memory_size();
stats.mem_alloc(mem.device_size);
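/* For the CPU device, "device" memory is just host memory: device_pointer
 * aliases data_pointer, so copying to/from the device is a no-op and the
 * functions below only need to handle zeroing, freeing and stats. */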
void mem_copy_to(device_memory& /*mem*/)
void mem_copy_from(device_memory& /*mem*/,
int /*y*/, int /*w*/, int /*h*/,
void mem_zero(device_memory& mem)
memset((void*)mem.device_pointer, 0, mem.memory_size());
void mem_free(device_memory& mem)
if(mem.device_pointer) {
mem.device_pointer = 0;
stats.mem_free(mem.device_size);
void const_copy_to(const char *name, void *host, size_t size)
kernel_const_copy(&kernel_globals, name, host, size);
void tex_alloc(const char *name,
InterpolationType interpolation,
ExtensionType extension)
VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
kernel_tex_copy(&kernel_globals,
mem.device_pointer = mem.data_pointer;
mem.device_size = mem.memory_size();
stats.mem_alloc(mem.device_size);
void tex_free(device_memory& mem)
if(mem.device_pointer) {
mem.device_pointer = 0;
stats.mem_free(mem.device_size);
void thread_run(DeviceTask *task)
if(task->type == DeviceTask::PATH_TRACE)
thread_path_trace(*task);
else if(task->type == DeviceTask::FILM_CONVERT)
thread_film_convert(*task);
else if(task->type == DeviceTask::SHADER)
thread_shader(*task);
class CPUDeviceTask : public DeviceTask {
CPUDeviceTask(CPUDevice *device, DeviceTask& task)
run = function_bind(&CPUDevice::thread_run, device, this);
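/* The binding above lets the task pool execute the task on a worker thread
 * by simply calling run(), without needing to know anything about CPUDevice. */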
void thread_path_trace(DeviceTask& task)
if(task_pool.canceled()) {
if(task.need_finish_queue == false)
KernelGlobals kg = kernel_globals;
OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
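/* kg above is this thread's own copy of the kernel globals, with per-thread
 * OSL state initialized for it, so the kernels can keep thread-local data
 * without locking. */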
void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int);
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
if(system_cpu_support_avx2()) {
path_trace_kernel = kernel_cpu_avx2_path_trace;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
if(system_cpu_support_avx()) {
path_trace_kernel = kernel_cpu_avx_path_trace;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if(system_cpu_support_sse41()) {
path_trace_kernel = kernel_cpu_sse41_path_trace;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
if(system_cpu_support_sse3()) {
path_trace_kernel = kernel_cpu_sse3_path_trace;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
if(system_cpu_support_sse2()) {
path_trace_kernel = kernel_cpu_sse2_path_trace;
path_trace_kernel = kernel_cpu_path_trace;
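/* The assignments above pick the kernel variant once per task: each #ifdef
 * block is only compiled in when that variant was built, and the runtime
 * feature check falls through to the plain C++ kernel_cpu_path_trace when
 * the CPU does not support it. */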
while(task.acquire_tile(this, tile)) {
float *render_buffer = (float*)tile.buffer;
uint *rng_state = (uint*)tile.rng_state;
int start_sample = tile.start_sample;
int end_sample = tile.start_sample + tile.num_samples;
for(int sample = start_sample; sample < end_sample; sample++) {
if(task.get_cancel() || task_pool.canceled()) {
if(task.need_finish_queue == false)
for(int y = tile.y; y < tile.y + tile.h; y++) {
for(int x = tile.x; x < tile.x + tile.w; x++) {
path_trace_kernel(&kg, render_buffer, rng_state,
sample, x, y, tile.offset, tile.stride);
tile.sample = sample + 1;
task.update_progress(&tile);
task.release_tile(tile);
if(task_pool.canceled()) {
if(task.need_finish_queue == false)
OSLShader::thread_free(&kg);
void thread_film_convert(DeviceTask& task)
float sample_scale = 1.0f/(task.sample + 1);
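/* The render buffer accumulates a running sum over samples; multiplying by
 * sample_scale = 1/(sample + 1) turns that sum into an average for display. */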
void(*convert_to_half_float_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
if(system_cpu_support_avx2()) {
convert_to_half_float_kernel = kernel_cpu_avx2_convert_to_half_float;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
if(system_cpu_support_avx()) {
convert_to_half_float_kernel = kernel_cpu_avx_convert_to_half_float;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if(system_cpu_support_sse41()) {
convert_to_half_float_kernel = kernel_cpu_sse41_convert_to_half_float;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
if(system_cpu_support_sse3()) {
convert_to_half_float_kernel = kernel_cpu_sse3_convert_to_half_float;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
if(system_cpu_support_sse2()) {
convert_to_half_float_kernel = kernel_cpu_sse2_convert_to_half_float;
convert_to_half_float_kernel = kernel_cpu_convert_to_half_float;
for(int y = task.y; y < task.y + task.h; y++)
for(int x = task.x; x < task.x + task.w; x++)
convert_to_half_float_kernel(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
sample_scale, x, y, task.offset, task.stride);
void(*convert_to_byte_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
if(system_cpu_support_avx2()) {
convert_to_byte_kernel = kernel_cpu_avx2_convert_to_byte;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
if(system_cpu_support_avx()) {
convert_to_byte_kernel = kernel_cpu_avx_convert_to_byte;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if(system_cpu_support_sse41()) {
convert_to_byte_kernel = kernel_cpu_sse41_convert_to_byte;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
if(system_cpu_support_sse3()) {
convert_to_byte_kernel = kernel_cpu_sse3_convert_to_byte;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
if(system_cpu_support_sse2()) {
convert_to_byte_kernel = kernel_cpu_sse2_convert_to_byte;
convert_to_byte_kernel = kernel_cpu_convert_to_byte;
for(int y = task.y; y < task.y + task.h; y++)
for(int x = task.x; x < task.x + task.w; x++)
convert_to_byte_kernel(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
sample_scale, x, y, task.offset, task.stride);
void thread_shader(DeviceTask& task)
KernelGlobals kg = kernel_globals;
OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
void(*shader_kernel)(KernelGlobals*, uint4*, float4*, float*, int, int, int, int, int);
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
if(system_cpu_support_avx2()) {
shader_kernel = kernel_cpu_avx2_shader;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
if(system_cpu_support_avx()) {
shader_kernel = kernel_cpu_avx_shader;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if(system_cpu_support_sse41()) {
shader_kernel = kernel_cpu_sse41_shader;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
if(system_cpu_support_sse3()) {
shader_kernel = kernel_cpu_sse3_shader;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
if(system_cpu_support_sse2()) {
shader_kernel = kernel_cpu_sse2_shader;
shader_kernel = kernel_cpu_shader;
for(int sample = 0; sample < task.num_samples; sample++) {
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
(uint4*)task.shader_input,
(float4*)task.shader_output,
(float*)task.shader_output_luma,
task.shader_eval_type,
if(task.get_cancel() || task_pool.canceled())
task.update_progress(NULL);
OSLShader::thread_free(&kg);
int get_split_task_count(DeviceTask& task)
if(task.type == DeviceTask::SHADER)
return task.get_subtask_count(TaskScheduler::num_threads(), 256);
return task.get_subtask_count(TaskScheduler::num_threads());
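/* The extra 256 passed for shader tasks above presumably caps the amount of
 * work per subtask so that progress updates and cancellation stay responsive;
 * other task types are simply split across the available threads. */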
void task_add(DeviceTask& task)
/* split task into smaller ones */
list<DeviceTask> tasks;
if(task.type == DeviceTask::SHADER)
task.split(tasks, TaskScheduler::num_threads(), 256);
task.split(tasks, TaskScheduler::num_threads());
foreach(DeviceTask& task, tasks)
task_pool.push(new CPUDeviceTask(this, task));
task_pool.wait_work();
Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
return new CPUDevice(info, stats, background);
void device_cpu_info(vector<DeviceInfo>& devices)
info.type = DEVICE_CPU;
info.description = system_cpu_brand_string();
info.advanced_shading = true;
info.pack_images = false;
devices.insert(devices.begin(), info);
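/* A minimal usage sketch (hypothetical caller, not part of this file), just
 * to show how the two entry points above fit together; in Blender the generic
 * device registry is what actually calls them:
 *
 *   vector<DeviceInfo> devices;
 *   device_cpu_info(devices);      // the CPU entry is inserted at the front
 *
 *   Stats stats;
 *   Device *device = device_cpu_create(devices[0], stats, true);
 *   ...
 *   delete device;
 */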
string device_cpu_capabilities(void)
string capabilities = "";
capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
capabilities += system_cpu_support_avx() ? "AVX " : "";
capabilities += system_cpu_support_avx2() ? "AVX2" : "";
/* Trim the trailing space; guard against an empty string in case none of the
 * instruction sets are supported. */
if(!capabilities.empty() && capabilities[capabilities.size() - 1] == ' ')
capabilities.resize(capabilities.size() - 1);
return capabilities;