Cycles Bake
[blender-staging.git] / intern / cycles / device / device_cpu.cpp
index e2f612ee2335e4f08f9f4102b14dd84af644d8bd..c9cc7592028bdedc1fcfeb15d7eb667e2e6135d1 100644 (file)
@@ -1,19 +1,17 @@
 /*
- * Copyright 2011, Blender Foundation.
+ * Copyright 2011-2013 Blender Foundation
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
  *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
  */
 
 #include <stdlib.h>
 #include "device_intern.h"
 
 #include "kernel.h"
+#include "kernel_compat_cpu.h"
 #include "kernel_types.h"
+#include "kernel_globals.h"
 
 #include "osl_shader.h"
+#include "osl_globals.h"
 
 #include "buffers.h"
 
@@ -43,30 +44,36 @@ class CPUDevice : public Device
 {
 public:
        TaskPool task_pool;
-       KernelGlobals *kg;
+       KernelGlobals kernel_globals;
+
+#ifdef WITH_OSL
+       OSLGlobals osl_globals;
+#endif
        
-       CPUDevice(int threads_num)
+       CPUDevice(DeviceInfo& info, Stats &stats, bool background)
+       : Device(info, stats, background)
        {
-               kg = kernel_globals_create();
+#ifdef WITH_OSL
+               kernel_globals.osl = &osl_globals;
+#endif
 
                /* do now to avoid thread issues */
-               system_cpu_support_optimized();
+               system_cpu_support_sse2();
+               system_cpu_support_sse3();
+               system_cpu_support_sse41();
+               system_cpu_support_avx();
        }
 
        ~CPUDevice()
        {
                task_pool.stop();
-               kernel_globals_free(kg);
-       }
-
-       bool support_advanced_shading()
-       {
-               return true;
        }
 
        void mem_alloc(device_memory& mem, MemoryType type)
        {
                mem.device_pointer = mem.data_pointer;
+
+               stats.mem_alloc(mem.memory_size());
        }
 
        void mem_copy_to(device_memory& mem)
@@ -87,28 +94,34 @@ public:
        void mem_free(device_memory& mem)
        {
                mem.device_pointer = 0;
+
+               stats.mem_free(mem.memory_size());
        }
 
        void const_copy_to(const char *name, void *host, size_t size)
        {
-               kernel_const_copy(kg, name, host, size);
+               kernel_const_copy(&kernel_globals, name, host, size);
        }
 
-       void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
+       void tex_alloc(const char *name, device_memory& mem, InterpolationType interpolation, bool periodic)
        {
-               kernel_tex_copy(kg, name, mem.data_pointer, mem.data_width, mem.data_height);
+               kernel_tex_copy(&kernel_globals, name, mem.data_pointer, mem.data_width, mem.data_height, mem.data_depth, interpolation);
                mem.device_pointer = mem.data_pointer;
+
+               stats.mem_alloc(mem.memory_size());
        }
 
        void tex_free(device_memory& mem)
        {
                mem.device_pointer = 0;
+
+               stats.mem_free(mem.memory_size());
        }
 
        void *osl_memory()
        {
 #ifdef WITH_OSL
-               return kernel_osl_memory(kg);
+               return &osl_globals;
 #else
                return NULL;
 #endif
@@ -118,8 +131,8 @@ public:
        {
                if(task->type == DeviceTask::PATH_TRACE)
                        thread_path_trace(*task);
-               else if(task->type == DeviceTask::TONEMAP)
-                       thread_tonemap(*task);
+               else if(task->type == DeviceTask::FILM_CONVERT)
+                       thread_film_convert(*task);
                else if(task->type == DeviceTask::SHADER)
                        thread_shader(*task);
        }
@@ -135,12 +148,15 @@ public:
 
        void thread_path_trace(DeviceTask& task)
        {
-               if(task_pool.cancelled())
-                       return;
+               if(task_pool.canceled()) {
+                       if(task.need_finish_queue == false)
+                               return;
+               }
+
+               KernelGlobals kg = kernel_globals;
 
 #ifdef WITH_OSL
-               if(kernel_osl_use(kg))
-                       OSLShader::thread_init(kg);
+               OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
 #endif
 
                RenderTile tile;
@@ -151,15 +167,83 @@ public:
                        int start_sample = tile.start_sample;
                        int end_sample = tile.start_sample + tile.num_samples;
 
-#ifdef WITH_OPTIMIZED_KERNEL
-                       if(system_cpu_support_optimized()) {
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+                       if(system_cpu_support_avx()) {
+                               for(int sample = start_sample; sample < end_sample; sample++) {
+                                       if (task.get_cancel() || task_pool.canceled()) {
+                                               if(task.need_finish_queue == false)
+                                                       break;
+                                       }
+
+                                       for(int y = tile.y; y < tile.y + tile.h; y++) {
+                                               for(int x = tile.x; x < tile.x + tile.w; x++) {
+                                                       kernel_cpu_avx_path_trace(&kg, render_buffer, rng_state,
+                                                               sample, x, y, tile.offset, tile.stride);
+                                               }
+                                       }
+
+                                       tile.sample = sample + 1;
+
+                                       task.update_progress(tile);
+                               }
+                       }
+                       else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41                      
+                       if(system_cpu_support_sse41()) {
+                               for(int sample = start_sample; sample < end_sample; sample++) {
+                                       if (task.get_cancel() || task_pool.canceled()) {
+                                               if(task.need_finish_queue == false)
+                                                       break;
+                                       }
+
+                                       for(int y = tile.y; y < tile.y + tile.h; y++) {
+                                               for(int x = tile.x; x < tile.x + tile.w; x++) {
+                                                       kernel_cpu_sse41_path_trace(&kg, render_buffer, rng_state,
+                                                               sample, x, y, tile.offset, tile.stride);
+                                               }
+                                       }
+
+                                       tile.sample = sample + 1;
+
+                                       task.update_progress(tile);
+                               }
+                       }
+                       else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+                       if(system_cpu_support_sse3()) {
+                               for(int sample = start_sample; sample < end_sample; sample++) {
+                                       if (task.get_cancel() || task_pool.canceled()) {
+                                               if(task.need_finish_queue == false)
+                                                       break;
+                                       }
+
+                                       for(int y = tile.y; y < tile.y + tile.h; y++) {
+                                               for(int x = tile.x; x < tile.x + tile.w; x++) {
+                                                       kernel_cpu_sse3_path_trace(&kg, render_buffer, rng_state,
+                                                               sample, x, y, tile.offset, tile.stride);
+                                               }
+                                       }
+
+                                       tile.sample = sample + 1;
+
+                                       task.update_progress(tile);
+                               }
+                       }
+                       else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+                       if(system_cpu_support_sse2()) {
                                for(int sample = start_sample; sample < end_sample; sample++) {
-                                       if (task.get_cancel() || task_pool.cancelled())
-                                               break;
+                                       if (task.get_cancel() || task_pool.canceled()) {
+                                               if(task.need_finish_queue == false)
+                                                       break;
+                                       }
 
                                        for(int y = tile.y; y < tile.y + tile.h; y++) {
                                                for(int x = tile.x; x < tile.x + tile.w; x++) {
-                                                       kernel_cpu_optimized_path_trace(kg, render_buffer, rng_state,
+                                                       kernel_cpu_sse2_path_trace(&kg, render_buffer, rng_state,
                                                                sample, x, y, tile.offset, tile.stride);
                                                }
                                        }
@@ -173,12 +257,14 @@ public:
 #endif
                        {
                                for(int sample = start_sample; sample < end_sample; sample++) {
-                                       if (task.get_cancel() || task_pool.cancelled())
-                                               break;
+                                       if (task.get_cancel() || task_pool.canceled()) {
+                                               if(task.need_finish_queue == false)
+                                                       break;
+                                       }
 
                                        for(int y = tile.y; y < tile.y + tile.h; y++) {
                                                for(int x = tile.x; x < tile.x + tile.w; x++) {
-                                                       kernel_cpu_path_trace(kg, render_buffer, rng_state,
+                                                       kernel_cpu_path_trace(&kg, render_buffer, rng_state,
                                                                sample, x, y, tile.offset, tile.stride);
                                                }
                                        }
@@ -191,48 +277,158 @@ public:
 
                        task.release_tile(tile);
 
-                       if(task_pool.cancelled())
-                               break;
+                       if(task_pool.canceled()) {
+                               if(task.need_finish_queue == false)
+                                       break;
+                       }
                }
 
 #ifdef WITH_OSL
-               if(kernel_osl_use(kg))
-                       OSLShader::thread_free(kg);
+               OSLShader::thread_free(&kg);
 #endif
        }
 
-       void thread_tonemap(DeviceTask& task)
+       void thread_film_convert(DeviceTask& task)
        {
-#ifdef WITH_OPTIMIZED_KERNEL
-               if(system_cpu_support_optimized()) {
-                       for(int y = task.y; y < task.y + task.h; y++)
-                               for(int x = task.x; x < task.x + task.w; x++)
-                                       kernel_cpu_optimized_tonemap(kg, (uchar4*)task.rgba, (float*)task.buffer,
-                                               task.sample, task.resolution, x, y, task.offset, task.stride);
+               float sample_scale = 1.0f/(task.sample + 1);
+
+               if(task.rgba_half) {
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+                       if(system_cpu_support_avx()) {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_avx_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
+                       else
+#endif 
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41                      
+                       if(system_cpu_support_sse41()) {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_sse41_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
+                       else
+#endif         
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3               
+                       if(system_cpu_support_sse3()) {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_sse3_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
+                       else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+                       if(system_cpu_support_sse2()) {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_sse2_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
+                       else
+#endif
+                       {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
                }
-               else
+               else {
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+                       if(system_cpu_support_avx()) {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_avx_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
+                       else
+#endif         
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41                      
+                       if(system_cpu_support_sse41()) {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_sse41_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
+                       else
+#endif                 
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+                       if(system_cpu_support_sse3()) {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_sse3_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
+                       else
 #endif
-               {
-                       for(int y = task.y; y < task.y + task.h; y++)
-                               for(int x = task.x; x < task.x + task.w; x++)
-                                       kernel_cpu_tonemap(kg, (uchar4*)task.rgba, (float*)task.buffer,
-                                               task.sample, task.resolution, x, y, task.offset, task.stride);
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+                       if(system_cpu_support_sse2()) {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_sse2_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
+                       else
+#endif
+                       {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
                }
        }
 
        void thread_shader(DeviceTask& task)
        {
+               KernelGlobals kg = kernel_globals;
+
 #ifdef WITH_OSL
-               if(kernel_osl_use(kg))
-                       OSLShader::thread_init(kg);
+               OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
 #endif
 
-#ifdef WITH_OPTIMIZED_KERNEL
-               if(system_cpu_support_optimized()) {
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+               if(system_cpu_support_avx()) {
                        for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
-                               kernel_cpu_optimized_shader(kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+                               kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
 
-                               if(task_pool.cancelled())
+                               if(task.get_cancel() || task_pool.canceled())
+                                       break;
+                       }
+               }
+               else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41                      
+               if(system_cpu_support_sse41()) {
+                       for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
+                               kernel_cpu_sse41_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+
+                               if(task.get_cancel() || task_pool.canceled())
+                                       break;
+                       }
+               }
+               else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+               if(system_cpu_support_sse3()) {
+                       for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
+                               kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+
+                               if(task.get_cancel() || task_pool.canceled())
+                                       break;
+                       }
+               }
+               else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+               if(system_cpu_support_sse2()) {
+                       for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
+                               kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+
+                               if(task.get_cancel() || task_pool.canceled())
                                        break;
                        }
                }
@@ -240,25 +436,23 @@ public:
 #endif
                {
                        for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
-                               kernel_cpu_shader(kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+                               kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
 
-                               if(task_pool.cancelled())
+                               if(task.get_cancel() || task_pool.canceled())
                                        break;
                        }
                }
 
 #ifdef WITH_OSL
-               if(kernel_osl_use(kg))
-                       OSLShader::thread_free(kg);
+               OSLShader::thread_free(&kg);
 #endif
        }
 
        void task_add(DeviceTask& task)
        {
-               /* split task into smaller ones, more than number of threads for uneven
-                * workloads where some parts of the image render slower than others */
+               /* split task into smaller ones */
                list<DeviceTask> tasks;
-               task.split(tasks, TaskScheduler::num_threads()+1);
+               task.split(tasks, TaskScheduler::num_threads());
 
                foreach(DeviceTask& task, tasks)
                        task_pool.push(new CPUDeviceTask(this, task));
@@ -273,16 +467,11 @@ public:
        {
                task_pool.cancel();
        }
-
-       bool task_cancelled()
-       {
-               return task_pool.cancelled();
-       }
 };
 
-Device *device_cpu_create(DeviceInfo& info, int threads)
+Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
 {
-       return new CPUDevice(threads);
+       return new CPUDevice(info, stats, background);
 }
 
 void device_cpu_info(vector<DeviceInfo>& devices)