Fix Cycles not using the SSE3 kernel after recent changes; the order of the checks with SSE2 should be switched so SSE3 is tested first.
[blender-staging.git] intern/cycles/device/device_cpu.cpp
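Note: the dispatch blocks in thread_path_trace(), thread_tonemap() and thread_shader() below now test system_cpu_support_sse3() before system_cpu_support_sse2(); a CPU that supports SSE3 also reports SSE2, so checking SSE2 first meant the SSE3 kernel was never selected.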
/*
 * Copyright 2011, Blender Foundation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

#include <stdlib.h>
#include <string.h>

#include "device.h"
#include "device_intern.h"

#include "kernel.h"
#include "kernel_compat_cpu.h"
#include "kernel_types.h"
#include "kernel_globals.h"

#include "osl_shader.h"
#include "osl_globals.h"

#include "buffers.h"

#include "util_debug.h"
#include "util_foreach.h"
#include "util_function.h"
#include "util_opengl.h"
#include "util_progress.h"
#include "util_system.h"
#include "util_thread.h"

CCL_NAMESPACE_BEGIN

class CPUDevice : public Device
{
public:
	TaskPool task_pool;
	KernelGlobals kernel_globals;
#ifdef WITH_OSL
	OSLGlobals osl_globals;
#endif

	CPUDevice(Stats &stats) : Device(stats)
	{
#ifdef WITH_OSL
		kernel_globals.osl = &osl_globals;
#endif

		/* do now to avoid thread issues: the support checks cache their
		 * result, so run them once from the main thread */
		system_cpu_support_sse2();
		system_cpu_support_sse3();
	}

	~CPUDevice()
	{
		task_pool.stop();
	}

	bool support_advanced_shading()
	{
		return true;
	}

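	/* the CPU device renders directly from host memory, so allocation just
	 * reuses the host data pointer and the copy functions can be no-ops */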
	void mem_alloc(device_memory& mem, MemoryType type)
	{
		mem.device_pointer = mem.data_pointer;

		stats.mem_alloc(mem.memory_size());
	}

	void mem_copy_to(device_memory& mem)
	{
		/* no-op */
	}

	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
	{
		/* no-op */
	}

	void mem_zero(device_memory& mem)
	{
		memset((void*)mem.device_pointer, 0, mem.memory_size());
	}

	void mem_free(device_memory& mem)
	{
		mem.device_pointer = 0;

		stats.mem_free(mem.memory_size());
	}

	void const_copy_to(const char *name, void *host, size_t size)
	{
		kernel_const_copy(&kernel_globals, name, host, size);
	}

	void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
	{
		kernel_tex_copy(&kernel_globals, name, mem.data_pointer, mem.data_width, mem.data_height);
		mem.device_pointer = mem.data_pointer;

		stats.mem_alloc(mem.memory_size());
	}

	void tex_free(device_memory& mem)
	{
		mem.device_pointer = 0;

		stats.mem_free(mem.memory_size());
	}

	void *osl_memory()
	{
#ifdef WITH_OSL
		return &osl_globals;
#else
		return NULL;
#endif
	}

	void thread_run(DeviceTask *task)
	{
		if(task->type == DeviceTask::PATH_TRACE)
			thread_path_trace(*task);
		else if(task->type == DeviceTask::TONEMAP)
			thread_tonemap(*task);
		else if(task->type == DeviceTask::SHADER)
			thread_shader(*task);
	}

	class CPUDeviceTask : public DeviceTask {
	public:
		CPUDeviceTask(CPUDevice *device, DeviceTask& task)
		: DeviceTask(task)
		{
			run = function_bind(&CPUDevice::thread_run, device, this);
		}
	};

	void thread_path_trace(DeviceTask& task)
	{
		if(task_pool.cancelled()) {
			if(task.need_finish_queue == false)
				return;
		}

		KernelGlobals kg = kernel_globals;

#ifdef WITH_OSL
		OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif

		RenderTile tile;

		while(task.acquire_tile(this, tile)) {
			float *render_buffer = (float*)tile.buffer;
			uint *rng_state = (uint*)tile.rng_state;
			int start_sample = tile.start_sample;
			int end_sample = tile.start_sample + tile.num_samples;

#ifdef WITH_OPTIMIZED_KERNEL
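			/* SSE3 implies SSE2, so test for the SSE3 kernel first;
			 * otherwise the SSE2 branch would always win on CPUs that support both */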
			if(system_cpu_support_sse3()) {
				for(int sample = start_sample; sample < end_sample; sample++) {
					if(task.get_cancel() || task_pool.cancelled()) {
						if(task.need_finish_queue == false)
							break;
					}

					for(int y = tile.y; y < tile.y + tile.h; y++) {
						for(int x = tile.x; x < tile.x + tile.w; x++) {
							kernel_cpu_sse3_path_trace(&kg, render_buffer, rng_state,
								sample, x, y, tile.offset, tile.stride);
						}
					}

					tile.sample = sample + 1;

					task.update_progress(tile);
				}
			}
			else if(system_cpu_support_sse2()) {
				for(int sample = start_sample; sample < end_sample; sample++) {
					if(task.get_cancel() || task_pool.cancelled()) {
						if(task.need_finish_queue == false)
							break;
					}

					for(int y = tile.y; y < tile.y + tile.h; y++) {
						for(int x = tile.x; x < tile.x + tile.w; x++) {
							kernel_cpu_sse2_path_trace(&kg, render_buffer, rng_state,
								sample, x, y, tile.offset, tile.stride);
						}
					}

					tile.sample = sample + 1;

					task.update_progress(tile);
				}
			}
			else
#endif
			{
				for(int sample = start_sample; sample < end_sample; sample++) {
					if(task.get_cancel() || task_pool.cancelled()) {
						if(task.need_finish_queue == false)
							break;
					}

					for(int y = tile.y; y < tile.y + tile.h; y++) {
						for(int x = tile.x; x < tile.x + tile.w; x++) {
							kernel_cpu_path_trace(&kg, render_buffer, rng_state,
								sample, x, y, tile.offset, tile.stride);
						}
					}

					tile.sample = sample + 1;

					task.update_progress(tile);
				}
			}

			task.release_tile(tile);

			if(task_pool.cancelled()) {
				if(task.need_finish_queue == false)
					break;
			}
		}

#ifdef WITH_OSL
		OSLShader::thread_free(&kg);
#endif
	}

	void thread_tonemap(DeviceTask& task)
	{
#ifdef WITH_OPTIMIZED_KERNEL
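		/* same ordering as in thread_path_trace(): prefer the SSE3 kernel over SSE2 */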
		if(system_cpu_support_sse3()) {
			for(int y = task.y; y < task.y + task.h; y++)
				for(int x = task.x; x < task.x + task.w; x++)
					kernel_cpu_sse3_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer,
						task.sample, task.resolution, x, y, task.offset, task.stride);
		}
		else if(system_cpu_support_sse2()) {
			for(int y = task.y; y < task.y + task.h; y++)
				for(int x = task.x; x < task.x + task.w; x++)
					kernel_cpu_sse2_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer,
						task.sample, task.resolution, x, y, task.offset, task.stride);
		}
		else
#endif
		{
			for(int y = task.y; y < task.y + task.h; y++)
				for(int x = task.x; x < task.x + task.w; x++)
					kernel_cpu_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer,
						task.sample, task.resolution, x, y, task.offset, task.stride);
		}
	}

	void thread_shader(DeviceTask& task)
	{
		KernelGlobals kg = kernel_globals;

#ifdef WITH_OSL
		OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif

#ifdef WITH_OPTIMIZED_KERNEL
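		/* same ordering as in thread_path_trace(): prefer the SSE3 kernel over SSE2 */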
		if(system_cpu_support_sse3()) {
			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
				kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);

				if(task_pool.cancelled())
					break;
			}
		}
		else if(system_cpu_support_sse2()) {
			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
				kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);

				if(task_pool.cancelled())
					break;
			}
		}
		else
#endif
		{
			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
				kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);

				if(task_pool.cancelled())
					break;
			}
		}

#ifdef WITH_OSL
		OSLShader::thread_free(&kg);
#endif
	}

	void task_add(DeviceTask& task)
	{
		/* split task into smaller ones, more than number of threads for uneven
		 * workloads where some parts of the image render slower than others */
		list<DeviceTask> tasks;
		task.split(tasks, TaskScheduler::num_threads());

		foreach(DeviceTask& task, tasks)
			task_pool.push(new CPUDeviceTask(this, task));
	}

	void task_wait()
	{
		task_pool.wait_work();
	}

	void task_cancel()
	{
		task_pool.cancel();
	}
};

Device *device_cpu_create(DeviceInfo& info, Stats &stats)
{
	return new CPUDevice(stats);
}

void device_cpu_info(vector<DeviceInfo>& devices)
{
	DeviceInfo info;

	info.type = DEVICE_CPU;
	info.description = system_cpu_brand_string();
	info.id = "CPU";
	info.num = 0;
	info.advanced_shading = true;
	info.pack_images = false;

	devices.insert(devices.begin(), info);
}

CCL_NAMESPACE_END