Merged revision(s) 59108-59184 from trunk/blender into soc-2013-dingto.
[blender-staging.git] / intern / cycles / device / device_cpu.cpp
1 /*
2  * Copyright 2011, Blender Foundation.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License
6  * as published by the Free Software Foundation; either version 2
7  * of the License, or (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write to the Free Software Foundation,
16  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17  */
18
19 #include <stdlib.h>
20 #include <string.h>
21
22 #include "device.h"
23 #include "device_intern.h"
24
25 #include "kernel.h"
26 #include "kernel_compat_cpu.h"
27 #include "kernel_types.h"
28 #include "kernel_globals.h"
29
30 #include "osl_shader.h"
31 #include "osl_globals.h"
32
33 #include "buffers.h"
34
35 #include "util_debug.h"
36 #include "util_foreach.h"
37 #include "util_function.h"
38 #include "util_opengl.h"
39 #include "util_progress.h"
40 #include "util_system.h"
41 #include "util_thread.h"
42
43 CCL_NAMESPACE_BEGIN
44
45 class CPUDevice : public Device
46 {
47 public:
48         TaskPool task_pool;
49         KernelGlobals kernel_globals;
50 #ifdef WITH_OSL
51         OSLGlobals osl_globals;
52 #endif
53         
54         CPUDevice(Stats &stats) : Device(stats)
55         {
56 #ifdef WITH_OSL
57                 kernel_globals.osl = &osl_globals;
58 #endif
59
60                 /* do now to avoid thread issues */
61                 system_cpu_support_sse2();
62                 system_cpu_support_sse3();
63         }
64
65         ~CPUDevice()
66         {
67                 task_pool.stop();
68         }
69
70         void mem_alloc(device_memory& mem, MemoryType type)
71         {
72                 mem.device_pointer = mem.data_pointer;
73
74                 stats.mem_alloc(mem.memory_size());
75         }
76
77         void mem_copy_to(device_memory& mem)
78         {
79                 /* no-op */
80         }
81
82         void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
83         {
84                 /* no-op */
85         }
86
87         void mem_zero(device_memory& mem)
88         {
89                 memset((void*)mem.device_pointer, 0, mem.memory_size());
90         }
91
92         void mem_free(device_memory& mem)
93         {
94                 mem.device_pointer = 0;
95
96                 stats.mem_free(mem.memory_size());
97         }
98
99         void const_copy_to(const char *name, void *host, size_t size)
100         {
101                 kernel_const_copy(&kernel_globals, name, host, size);
102         }
103
104         void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
105         {
106                 kernel_tex_copy(&kernel_globals, name, mem.data_pointer, mem.data_width, mem.data_height);
107                 mem.device_pointer = mem.data_pointer;
108
109                 stats.mem_alloc(mem.memory_size());
110         }
111
112         void tex_free(device_memory& mem)
113         {
114                 mem.device_pointer = 0;
115
116                 stats.mem_free(mem.memory_size());
117         }
118
119         void *osl_memory()
120         {
121 #ifdef WITH_OSL
122                 return &osl_globals;
123 #else
124                 return NULL;
125 #endif
126         }
127
128         void thread_run(DeviceTask *task)
129         {
130                 if(task->type == DeviceTask::PATH_TRACE)
131                         thread_path_trace(*task);
132                 else if(task->type == DeviceTask::TONEMAP)
133                         thread_tonemap(*task);
134                 else if(task->type == DeviceTask::SHADER)
135                         thread_shader(*task);
136         }
137
138         class CPUDeviceTask : public DeviceTask {
139         public:
140                 CPUDeviceTask(CPUDevice *device, DeviceTask& task)
141                 : DeviceTask(task)
142                 {
143                         run = function_bind(&CPUDevice::thread_run, device, this);
144                 }
145         };
146
147         void thread_path_trace(DeviceTask& task)
148         {
149                 if(task_pool.cancelled()) {
150                         if(task.need_finish_queue == false)
151                                 return;
152                 }
153
154                 KernelGlobals kg = kernel_globals;
155
156 #ifdef WITH_OSL
157                 OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
158 #endif
159
160                 RenderTile tile;
161                 
162                 while(task.acquire_tile(this, tile)) {
163                         float *render_buffer = (float*)tile.buffer;
164                         uint *rng_state = (uint*)tile.rng_state;
165                         int start_sample = tile.start_sample;
166                         int end_sample = tile.start_sample + tile.num_samples;
167
168 #ifdef WITH_OPTIMIZED_KERNEL
169                         if(system_cpu_support_sse3()) {
170                                 for(int sample = start_sample; sample < end_sample; sample++) {
171                                         if (task.get_cancel() || task_pool.cancelled()) {
172                                                 if(task.need_finish_queue == false)
173                                                         break;
174                                         }
175
176                                         for(int y = tile.y; y < tile.y + tile.h; y++) {
177                                                 for(int x = tile.x; x < tile.x + tile.w; x++) {
178                                                         kernel_cpu_sse3_path_trace(&kg, render_buffer, rng_state,
179                                                                 sample, x, y, tile.offset, tile.stride);
180                                                 }
181                                         }
182
183                                         tile.sample = sample + 1;
184
185                                         task.update_progress(tile);
186                                 }
187                         }
188                         else if(system_cpu_support_sse2()) {
189                                 for(int sample = start_sample; sample < end_sample; sample++) {
190                                         if (task.get_cancel() || task_pool.cancelled()) {
191                                                 if(task.need_finish_queue == false)
192                                                         break;
193                                         }
194
195                                         for(int y = tile.y; y < tile.y + tile.h; y++) {
196                                                 for(int x = tile.x; x < tile.x + tile.w; x++) {
197                                                         kernel_cpu_sse2_path_trace(&kg, render_buffer, rng_state,
198                                                                 sample, x, y, tile.offset, tile.stride);
199                                                 }
200                                         }
201
202                                         tile.sample = sample + 1;
203
204                                         task.update_progress(tile);
205                                 }
206                         }
207                         else
208 #endif
209                         {
210                                 for(int sample = start_sample; sample < end_sample; sample++) {
211                                         if (task.get_cancel() || task_pool.cancelled()) {
212                                                 if(task.need_finish_queue == false)
213                                                         break;
214                                         }
215
216                                         for(int y = tile.y; y < tile.y + tile.h; y++) {
217                                                 for(int x = tile.x; x < tile.x + tile.w; x++) {
218                                                         kernel_cpu_path_trace(&kg, render_buffer, rng_state,
219                                                                 sample, x, y, tile.offset, tile.stride);
220                                                 }
221                                         }
222
223                                         tile.sample = sample + 1;
224
225                                         task.update_progress(tile);
226                                 }
227                         }
228
229                         task.release_tile(tile);
230
231                         if(task_pool.cancelled()) {
232                                 if(task.need_finish_queue == false)
233                                         break;
234                         }
235                 }
236
237 #ifdef WITH_OSL
238                 OSLShader::thread_free(&kg);
239 #endif
240         }
241
242         void thread_tonemap(DeviceTask& task)
243         {
244 #ifdef WITH_OPTIMIZED_KERNEL
245                 if(system_cpu_support_sse3()) {
246                         for(int y = task.y; y < task.y + task.h; y++)
247                                 for(int x = task.x; x < task.x + task.w; x++)
248                                         kernel_cpu_sse3_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer,
249                                                 task.sample, x, y, task.offset, task.stride);
250                 }
251                 else if(system_cpu_support_sse2()) {
252                         for(int y = task.y; y < task.y + task.h; y++)
253                                 for(int x = task.x; x < task.x + task.w; x++)
254                                         kernel_cpu_sse2_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer,
255                                                 task.sample, x, y, task.offset, task.stride);
256                 }
257                 else
258 #endif
259                 {
260                         for(int y = task.y; y < task.y + task.h; y++)
261                                 for(int x = task.x; x < task.x + task.w; x++)
262                                         kernel_cpu_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer,
263                                                 task.sample, x, y, task.offset, task.stride);
264                 }
265         }
266
267         void thread_shader(DeviceTask& task)
268         {
269                 KernelGlobals kg = kernel_globals;
270
271 #ifdef WITH_OSL
272                 OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
273 #endif
274
275 #ifdef WITH_OPTIMIZED_KERNEL
276                 if(system_cpu_support_sse3()) {
277                         for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
278                                 kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
279
280                                 if(task_pool.cancelled())
281                                         break;
282                         }
283                 }
284                 else if(system_cpu_support_sse2()) {
285                         for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
286                                 kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
287
288                                 if(task_pool.cancelled())
289                                         break;
290                         }
291                 }
292                 else
293 #endif
294                 {
295                         for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
296                                 kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
297
298                                 if(task_pool.cancelled())
299                                         break;
300                         }
301                 }
302
303 #ifdef WITH_OSL
304                 OSLShader::thread_free(&kg);
305 #endif
306         }
307
308         void task_add(DeviceTask& task)
309         {
310                 /* split task into smaller ones */
311                 list<DeviceTask> tasks;
312                 task.split(tasks, TaskScheduler::num_threads());
313
314                 foreach(DeviceTask& task, tasks)
315                         task_pool.push(new CPUDeviceTask(this, task));
316         }
317
318         void task_wait()
319         {
320                 task_pool.wait_work();
321         }
322
323         void task_cancel()
324         {
325                 task_pool.cancel();
326         }
327 };
328
329 Device *device_cpu_create(DeviceInfo& info, Stats &stats)
330 {
331         return new CPUDevice(stats);
332 }
333
334 void device_cpu_info(vector<DeviceInfo>& devices)
335 {
336         DeviceInfo info;
337
338         info.type = DEVICE_CPU;
339         info.description = system_cpu_brand_string();
340         info.id = "CPU";
341         info.num = 0;
342         info.extended_images = true;
343         info.advanced_shading = true;
344         info.pack_images = false;
345
346         devices.insert(devices.begin(), info);
347 }
348
349 CCL_NAMESPACE_END
350