Cycles Bake
[blender-staging.git] / intern / cycles / device / device_cpu.cpp
index 1915245bb55db92b99a9120e265791e58786decb..c9cc7592028bdedc1fcfeb15d7eb667e2e6135d1 100644 (file)
@@ -1,19 +1,17 @@
 /*
- * Copyright 2011, Blender Foundation.
+ * Copyright 2011-2013 Blender Foundation
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
  *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 
 #include <stdlib.h>
@@ -47,11 +45,13 @@ class CPUDevice : public Device
 public:
        TaskPool task_pool;
        KernelGlobals kernel_globals;
+
 #ifdef WITH_OSL
        OSLGlobals osl_globals;
 #endif
        
-       CPUDevice(Stats &stats) : Device(stats)
+       CPUDevice(DeviceInfo& info, Stats &stats, bool background)
+       : Device(info, stats, background)
        {
 #ifdef WITH_OSL
                kernel_globals.osl = &osl_globals;
@@ -60,6 +60,8 @@ public:
                /* do now to avoid thread issues */
                system_cpu_support_sse2();
                system_cpu_support_sse3();
+               system_cpu_support_sse41();
+               system_cpu_support_avx();
        }
 
        ~CPUDevice()
@@ -67,11 +69,6 @@ public:
                task_pool.stop();
        }
 
-       bool support_advanced_shading()
-       {
-               return true;
-       }
-
        void mem_alloc(device_memory& mem, MemoryType type)
        {
                mem.device_pointer = mem.data_pointer;
@@ -106,9 +103,9 @@ public:
                kernel_const_copy(&kernel_globals, name, host, size);
        }
 
-       void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
+       void tex_alloc(const char *name, device_memory& mem, InterpolationType interpolation, bool periodic)
        {
-               kernel_tex_copy(&kernel_globals, name, mem.data_pointer, mem.data_width, mem.data_height);
+               kernel_tex_copy(&kernel_globals, name, mem.data_pointer, mem.data_width, mem.data_height, mem.data_depth, interpolation);
                mem.device_pointer = mem.data_pointer;
 
                stats.mem_alloc(mem.memory_size());
@@ -134,8 +131,8 @@ public:
        {
                if(task->type == DeviceTask::PATH_TRACE)
                        thread_path_trace(*task);
-               else if(task->type == DeviceTask::TONEMAP)
-                       thread_tonemap(*task);
+               else if(task->type == DeviceTask::FILM_CONVERT)
+                       thread_film_convert(*task);
                else if(task->type == DeviceTask::SHADER)
                        thread_shader(*task);
        }
@@ -151,7 +148,7 @@ public:
 
        void thread_path_trace(DeviceTask& task)
        {
-               if(task_pool.cancelled()) {
+               if(task_pool.canceled()) {
                        if(task.need_finish_queue == false)
                                return;
                }
@@ -170,17 +167,39 @@ public:
                        int start_sample = tile.start_sample;
                        int end_sample = tile.start_sample + tile.num_samples;
 
-#ifdef WITH_OPTIMIZED_KERNEL
-                       if(system_cpu_support_sse2()) {
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+                       if(system_cpu_support_avx()) {
                                for(int sample = start_sample; sample < end_sample; sample++) {
-                                       if (task.get_cancel() || task_pool.cancelled()) {
+                                       if (task.get_cancel() || task_pool.canceled()) {
                                                if(task.need_finish_queue == false)
                                                        break;
                                        }
 
                                        for(int y = tile.y; y < tile.y + tile.h; y++) {
                                                for(int x = tile.x; x < tile.x + tile.w; x++) {
-                                                       kernel_cpu_sse2_path_trace(&kg, render_buffer, rng_state,
+                                                       kernel_cpu_avx_path_trace(&kg, render_buffer, rng_state,
+                                                               sample, x, y, tile.offset, tile.stride);
+                                               }
+                                       }
+
+                                       tile.sample = sample + 1;
+
+                                       task.update_progress(tile);
+                               }
+                       }
+                       else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41                      
+                       if(system_cpu_support_sse41()) {
+                               for(int sample = start_sample; sample < end_sample; sample++) {
+                                       if (task.get_cancel() || task_pool.canceled()) {
+                                               if(task.need_finish_queue == false)
+                                                       break;
+                                       }
+
+                                       for(int y = tile.y; y < tile.y + tile.h; y++) {
+                                               for(int x = tile.x; x < tile.x + tile.w; x++) {
+                                                       kernel_cpu_sse41_path_trace(&kg, render_buffer, rng_state,
                                                                sample, x, y, tile.offset, tile.stride);
                                                }
                                        }
@@ -190,9 +209,12 @@ public:
                                        task.update_progress(tile);
                                }
                        }
-                       else if(system_cpu_support_sse3()) {
+                       else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+                       if(system_cpu_support_sse3()) {
                                for(int sample = start_sample; sample < end_sample; sample++) {
-                                       if (task.get_cancel() || task_pool.cancelled()) {
+                                       if (task.get_cancel() || task_pool.canceled()) {
                                                if(task.need_finish_queue == false)
                                                        break;
                                        }
@@ -210,10 +232,32 @@ public:
                                }
                        }
                        else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+                       if(system_cpu_support_sse2()) {
+                               for(int sample = start_sample; sample < end_sample; sample++) {
+                                       if (task.get_cancel() || task_pool.canceled()) {
+                                               if(task.need_finish_queue == false)
+                                                       break;
+                                       }
+
+                                       for(int y = tile.y; y < tile.y + tile.h; y++) {
+                                               for(int x = tile.x; x < tile.x + tile.w; x++) {
+                                                       kernel_cpu_sse2_path_trace(&kg, render_buffer, rng_state,
+                                                               sample, x, y, tile.offset, tile.stride);
+                                               }
+                                       }
+
+                                       tile.sample = sample + 1;
+
+                                       task.update_progress(tile);
+                               }
+                       }
+                       else
 #endif
                        {
                                for(int sample = start_sample; sample < end_sample; sample++) {
-                                       if (task.get_cancel() || task_pool.cancelled()) {
+                                       if (task.get_cancel() || task_pool.canceled()) {
                                                if(task.need_finish_queue == false)
                                                        break;
                                        }
@@ -233,7 +277,7 @@ public:
 
                        task.release_tile(tile);
 
-                       if(task_pool.cancelled()) {
+                       if(task_pool.canceled()) {
                                if(task.need_finish_queue == false)
                                        break;
                        }
@@ -244,28 +288,97 @@ public:
 #endif
        }
 
-       void thread_tonemap(DeviceTask& task)
+       void thread_film_convert(DeviceTask& task)
        {
-#ifdef WITH_OPTIMIZED_KERNEL
-               if(system_cpu_support_sse2()) {
-                       for(int y = task.y; y < task.y + task.h; y++)
-                               for(int x = task.x; x < task.x + task.w; x++)
-                                       kernel_cpu_sse2_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer,
-                                               task.sample, task.resolution, x, y, task.offset, task.stride);
-               }
-               else if(system_cpu_support_sse3()) {
-                       for(int y = task.y; y < task.y + task.h; y++)
-                               for(int x = task.x; x < task.x + task.w; x++)
-                                       kernel_cpu_sse3_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer,
-                                               task.sample, task.resolution, x, y, task.offset, task.stride);
+               float sample_scale = 1.0f/(task.sample + 1);
+
+               if(task.rgba_half) {
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+                       if(system_cpu_support_avx()) {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_avx_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
+                       else
+#endif 
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41                      
+                       if(system_cpu_support_sse41()) {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_sse41_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
+                       else
+#endif         
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3               
+                       if(system_cpu_support_sse3()) {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_sse3_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
+                       else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+                       if(system_cpu_support_sse2()) {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_sse2_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
+                       else
+#endif
+                       {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
                }
-               else
+               else {
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+                       if(system_cpu_support_avx()) {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_avx_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
+                       else
+#endif         
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41                      
+                       if(system_cpu_support_sse41()) {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_sse41_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
+                       else
+#endif                 
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+                       if(system_cpu_support_sse3()) {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_sse3_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
+                       else
 #endif
-               {
-                       for(int y = task.y; y < task.y + task.h; y++)
-                               for(int x = task.x; x < task.x + task.w; x++)
-                                       kernel_cpu_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer,
-                                               task.sample, task.resolution, x, y, task.offset, task.stride);
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+                       if(system_cpu_support_sse2()) {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_sse2_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
+                       else
+#endif
+                       {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
                }
        }
 
@@ -277,20 +390,45 @@ public:
                OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
 #endif
 
-#ifdef WITH_OPTIMIZED_KERNEL
-               if(system_cpu_support_sse2()) {
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+               if(system_cpu_support_avx()) {
                        for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
-                               kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+                               kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
 
-                               if(task_pool.cancelled())
+                               if(task.get_cancel() || task_pool.canceled())
                                        break;
                        }
                }
-               else if(system_cpu_support_sse3()) {
+               else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41                      
+               if(system_cpu_support_sse41()) {
+                       for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
+                               kernel_cpu_sse41_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+
+                               if(task.get_cancel() || task_pool.canceled())
+                                       break;
+                       }
+               }
+               else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+               if(system_cpu_support_sse3()) {
                        for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
                                kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
 
-                               if(task_pool.cancelled())
+                               if(task.get_cancel() || task_pool.canceled())
+                                       break;
+                       }
+               }
+               else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+               if(system_cpu_support_sse2()) {
+                       for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
+                               kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+
+                               if(task.get_cancel() || task_pool.canceled())
                                        break;
                        }
                }
@@ -300,7 +438,7 @@ public:
                        for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
                                kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
 
-                               if(task_pool.cancelled())
+                               if(task.get_cancel() || task_pool.canceled())
                                        break;
                        }
                }
@@ -312,8 +450,7 @@ public:
 
        void task_add(DeviceTask& task)
        {
-               /* split task into smaller ones, more than number of threads for uneven
-                * workloads where some parts of the image render slower than others */
+               /* split task into smaller ones, using the scheduler's thread count */
                list<DeviceTask> tasks;
                task.split(tasks, TaskScheduler::num_threads());
 
@@ -332,9 +469,9 @@ public:
        }
 };
 
-Device *device_cpu_create(DeviceInfo& info, Stats &stats)
+Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
 {
-       return new CPUDevice(stats);
+       return new CPUDevice(info, stats, background);
 }
 
 void device_cpu_info(vector<DeviceInfo>& devices)