Cycles Bake
[blender-staging.git] / intern / cycles / device / device_cpu.cpp
index f2743e15e2b7f364eedb13b92306aeb3bb78091f..c9cc7592028bdedc1fcfeb15d7eb667e2e6135d1 100644 (file)
@@ -1,19 +1,17 @@
 /*
 /*
- * Copyright 2011, Blender Foundation.
+ * Copyright 2011-2013 Blender Foundation
  *
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
  *
  *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
  */
 
 #include <stdlib.h>
  */
 
 #include <stdlib.h>
@@ -47,11 +45,13 @@ class CPUDevice : public Device
 public:
        TaskPool task_pool;
        KernelGlobals kernel_globals;
 public:
        TaskPool task_pool;
        KernelGlobals kernel_globals;
+
 #ifdef WITH_OSL
        OSLGlobals osl_globals;
 #endif
        
 #ifdef WITH_OSL
        OSLGlobals osl_globals;
 #endif
        
-       CPUDevice(Stats &stats) : Device(stats)
+       CPUDevice(DeviceInfo& info, Stats &stats, bool background)
+       : Device(info, stats, background)
        {
 #ifdef WITH_OSL
                kernel_globals.osl = &osl_globals;
        {
 #ifdef WITH_OSL
                kernel_globals.osl = &osl_globals;
@@ -60,6 +60,8 @@ public:
                /* do now to avoid thread issues */
                system_cpu_support_sse2();
                system_cpu_support_sse3();
                /* do now to avoid thread issues */
                system_cpu_support_sse2();
                system_cpu_support_sse3();
+               system_cpu_support_sse41();
+               system_cpu_support_avx();
        }
 
        ~CPUDevice()
        }
 
        ~CPUDevice()
@@ -101,9 +103,9 @@ public:
                kernel_const_copy(&kernel_globals, name, host, size);
        }
 
                kernel_const_copy(&kernel_globals, name, host, size);
        }
 
-       void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
+       void tex_alloc(const char *name, device_memory& mem, InterpolationType interpolation, bool periodic)
        {
        {
-               kernel_tex_copy(&kernel_globals, name, mem.data_pointer, mem.data_width, mem.data_height);
+               kernel_tex_copy(&kernel_globals, name, mem.data_pointer, mem.data_width, mem.data_height, mem.data_depth, interpolation);
                mem.device_pointer = mem.data_pointer;
 
                stats.mem_alloc(mem.memory_size());
                mem.device_pointer = mem.data_pointer;
 
                stats.mem_alloc(mem.memory_size());
@@ -129,8 +131,8 @@ public:
        {
                if(task->type == DeviceTask::PATH_TRACE)
                        thread_path_trace(*task);
        {
                if(task->type == DeviceTask::PATH_TRACE)
                        thread_path_trace(*task);
-               else if(task->type == DeviceTask::TONEMAP)
-                       thread_tonemap(*task);
+               else if(task->type == DeviceTask::FILM_CONVERT)
+                       thread_film_convert(*task);
                else if(task->type == DeviceTask::SHADER)
                        thread_shader(*task);
        }
                else if(task->type == DeviceTask::SHADER)
                        thread_shader(*task);
        }
@@ -146,7 +148,7 @@ public:
 
        void thread_path_trace(DeviceTask& task)
        {
 
        void thread_path_trace(DeviceTask& task)
        {
-               if(task_pool.cancelled()) {
+               if(task_pool.canceled()) {
                        if(task.need_finish_queue == false)
                                return;
                }
                        if(task.need_finish_queue == false)
                                return;
                }
@@ -165,10 +167,54 @@ public:
                        int start_sample = tile.start_sample;
                        int end_sample = tile.start_sample + tile.num_samples;
 
                        int start_sample = tile.start_sample;
                        int end_sample = tile.start_sample + tile.num_samples;
 
-#ifdef WITH_OPTIMIZED_KERNEL
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+                       if(system_cpu_support_avx()) {
+                               for(int sample = start_sample; sample < end_sample; sample++) {
+                                       if (task.get_cancel() || task_pool.canceled()) {
+                                               if(task.need_finish_queue == false)
+                                                       break;
+                                       }
+
+                                       for(int y = tile.y; y < tile.y + tile.h; y++) {
+                                               for(int x = tile.x; x < tile.x + tile.w; x++) {
+                                                       kernel_cpu_avx_path_trace(&kg, render_buffer, rng_state,
+                                                               sample, x, y, tile.offset, tile.stride);
+                                               }
+                                       }
+
+                                       tile.sample = sample + 1;
+
+                                       task.update_progress(tile);
+                               }
+                       }
+                       else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41                      
+                       if(system_cpu_support_sse41()) {
+                               for(int sample = start_sample; sample < end_sample; sample++) {
+                                       if (task.get_cancel() || task_pool.canceled()) {
+                                               if(task.need_finish_queue == false)
+                                                       break;
+                                       }
+
+                                       for(int y = tile.y; y < tile.y + tile.h; y++) {
+                                               for(int x = tile.x; x < tile.x + tile.w; x++) {
+                                                       kernel_cpu_sse41_path_trace(&kg, render_buffer, rng_state,
+                                                               sample, x, y, tile.offset, tile.stride);
+                                               }
+                                       }
+
+                                       tile.sample = sample + 1;
+
+                                       task.update_progress(tile);
+                               }
+                       }
+                       else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
                        if(system_cpu_support_sse3()) {
                                for(int sample = start_sample; sample < end_sample; sample++) {
                        if(system_cpu_support_sse3()) {
                                for(int sample = start_sample; sample < end_sample; sample++) {
-                                       if (task.get_cancel() || task_pool.cancelled()) {
+                                       if (task.get_cancel() || task_pool.canceled()) {
                                                if(task.need_finish_queue == false)
                                                        break;
                                        }
                                                if(task.need_finish_queue == false)
                                                        break;
                                        }
@@ -185,9 +231,12 @@ public:
                                        task.update_progress(tile);
                                }
                        }
                                        task.update_progress(tile);
                                }
                        }
-                       else if(system_cpu_support_sse2()) {
+                       else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+                       if(system_cpu_support_sse2()) {
                                for(int sample = start_sample; sample < end_sample; sample++) {
                                for(int sample = start_sample; sample < end_sample; sample++) {
-                                       if (task.get_cancel() || task_pool.cancelled()) {
+                                       if (task.get_cancel() || task_pool.canceled()) {
                                                if(task.need_finish_queue == false)
                                                        break;
                                        }
                                                if(task.need_finish_queue == false)
                                                        break;
                                        }
@@ -208,7 +257,7 @@ public:
 #endif
                        {
                                for(int sample = start_sample; sample < end_sample; sample++) {
 #endif
                        {
                                for(int sample = start_sample; sample < end_sample; sample++) {
-                                       if (task.get_cancel() || task_pool.cancelled()) {
+                                       if (task.get_cancel() || task_pool.canceled()) {
                                                if(task.need_finish_queue == false)
                                                        break;
                                        }
                                                if(task.need_finish_queue == false)
                                                        break;
                                        }
@@ -228,7 +277,7 @@ public:
 
                        task.release_tile(tile);
 
 
                        task.release_tile(tile);
 
-                       if(task_pool.cancelled()) {
+                       if(task_pool.canceled()) {
                                if(task.need_finish_queue == false)
                                        break;
                        }
                                if(task.need_finish_queue == false)
                                        break;
                        }
@@ -239,28 +288,97 @@ public:
 #endif
        }
 
 #endif
        }
 
-       void thread_tonemap(DeviceTask& task)
+       void thread_film_convert(DeviceTask& task)
        {
        {
-#ifdef WITH_OPTIMIZED_KERNEL
-               if(system_cpu_support_sse3()) {
-                       for(int y = task.y; y < task.y + task.h; y++)
-                               for(int x = task.x; x < task.x + task.w; x++)
-                                       kernel_cpu_sse3_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer,
-                                               task.sample, x, y, task.offset, task.stride);
-               }
-               else if(system_cpu_support_sse2()) {
-                       for(int y = task.y; y < task.y + task.h; y++)
-                               for(int x = task.x; x < task.x + task.w; x++)
-                                       kernel_cpu_sse2_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer,
-                                               task.sample, x, y, task.offset, task.stride);
+               float sample_scale = 1.0f/(task.sample + 1);
+
+               if(task.rgba_half) {
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+                       if(system_cpu_support_avx()) {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_avx_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
+                       else
+#endif 
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41                      
+                       if(system_cpu_support_sse41()) {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_sse41_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
+                       else
+#endif         
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3               
+                       if(system_cpu_support_sse3()) {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_sse3_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
+                       else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+                       if(system_cpu_support_sse2()) {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_sse2_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
+                       else
+#endif
+                       {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
                }
                }
-               else
+               else {
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+                       if(system_cpu_support_avx()) {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_avx_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
+                       else
+#endif         
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41                      
+                       if(system_cpu_support_sse41()) {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_sse41_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
+                       else
+#endif                 
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+                       if(system_cpu_support_sse3()) {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_sse3_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
+                       else
 #endif
 #endif
-               {
-                       for(int y = task.y; y < task.y + task.h; y++)
-                               for(int x = task.x; x < task.x + task.w; x++)
-                                       kernel_cpu_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer,
-                                               task.sample, x, y, task.offset, task.stride);
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+                       if(system_cpu_support_sse2()) {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_sse2_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
+                       else
+#endif
+                       {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
                }
        }
 
                }
        }
 
@@ -272,20 +390,45 @@ public:
                OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
 #endif
 
                OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
 #endif
 
-#ifdef WITH_OPTIMIZED_KERNEL
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+               if(system_cpu_support_avx()) {
+                       for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
+                               kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+
+                               if(task.get_cancel() || task_pool.canceled())
+                                       break;
+                       }
+               }
+               else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41                      
+               if(system_cpu_support_sse41()) {
+                       for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
+                               kernel_cpu_sse41_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+
+                               if(task.get_cancel() || task_pool.canceled())
+                                       break;
+                       }
+               }
+               else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
                if(system_cpu_support_sse3()) {
                        for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
                                kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
 
                if(system_cpu_support_sse3()) {
                        for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
                                kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
 
-                               if(task_pool.cancelled())
+                               if(task.get_cancel() || task_pool.canceled())
                                        break;
                        }
                }
                                        break;
                        }
                }
-               else if(system_cpu_support_sse2()) {
+               else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+               if(system_cpu_support_sse2()) {
                        for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
                                kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
 
                        for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
                                kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
 
-                               if(task_pool.cancelled())
+                               if(task.get_cancel() || task_pool.canceled())
                                        break;
                        }
                }
                                        break;
                        }
                }
@@ -295,7 +438,7 @@ public:
                        for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
                                kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
 
                        for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
                                kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
 
-                               if(task_pool.cancelled())
+                               if(task.get_cancel() || task_pool.canceled())
                                        break;
                        }
                }
                                        break;
                        }
                }
@@ -326,9 +469,9 @@ public:
        }
 };
 
        }
 };
 
-Device *device_cpu_create(DeviceInfo& info, Stats &stats)
+Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
 {
 {
-       return new CPUDevice(stats);
+       return new CPUDevice(info, stats, background);
 }
 
 void device_cpu_info(vector<DeviceInfo>& devices)
 }
 
 void device_cpu_info(vector<DeviceInfo>& devices)
@@ -339,7 +482,6 @@ void device_cpu_info(vector<DeviceInfo>& devices)
        info.description = system_cpu_brand_string();
        info.id = "CPU";
        info.num = 0;
        info.description = system_cpu_brand_string();
        info.id = "CPU";
        info.num = 0;
-       info.extended_images = true;
        info.advanced_shading = true;
        info.pack_images = false;
 
        info.advanced_shading = true;
        info.pack_images = false;