Merging r59182 through r59257 from trunk into soc-2013-depsgraph_mt
[blender-staging.git] / intern / cycles / device / device_cpu.cpp
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License
15  */
16
17 #include <stdlib.h>
18 #include <string.h>
19
20 #include "device.h"
21 #include "device_intern.h"
22
23 #include "kernel.h"
24 #include "kernel_compat_cpu.h"
25 #include "kernel_types.h"
26 #include "kernel_globals.h"
27
28 #include "osl_shader.h"
29 #include "osl_globals.h"
30
31 #include "buffers.h"
32
33 #include "util_debug.h"
34 #include "util_foreach.h"
35 #include "util_function.h"
36 #include "util_opengl.h"
37 #include "util_progress.h"
38 #include "util_system.h"
39 #include "util_thread.h"
40
41 CCL_NAMESPACE_BEGIN
42
43 class CPUDevice : public Device
44 {
45 public:
46         TaskPool task_pool;
47         KernelGlobals kernel_globals;
48 #ifdef WITH_OSL
49         OSLGlobals osl_globals;
50 #endif
51         
52         CPUDevice(Stats &stats) : Device(stats)
53         {
54 #ifdef WITH_OSL
55                 kernel_globals.osl = &osl_globals;
56 #endif
57
58                 /* do now to avoid thread issues */
59                 system_cpu_support_sse2();
60                 system_cpu_support_sse3();
61         }
62
63         ~CPUDevice()
64         {
65                 task_pool.stop();
66         }
67
68         void mem_alloc(device_memory& mem, MemoryType type)
69         {
70                 mem.device_pointer = mem.data_pointer;
71
72                 stats.mem_alloc(mem.memory_size());
73         }
74
75         void mem_copy_to(device_memory& mem)
76         {
77                 /* no-op */
78         }
79
80         void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
81         {
82                 /* no-op */
83         }
84
85         void mem_zero(device_memory& mem)
86         {
87                 memset((void*)mem.device_pointer, 0, mem.memory_size());
88         }
89
90         void mem_free(device_memory& mem)
91         {
92                 mem.device_pointer = 0;
93
94                 stats.mem_free(mem.memory_size());
95         }
96
97         void const_copy_to(const char *name, void *host, size_t size)
98         {
99                 kernel_const_copy(&kernel_globals, name, host, size);
100         }
101
102         void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
103         {
104                 kernel_tex_copy(&kernel_globals, name, mem.data_pointer, mem.data_width, mem.data_height);
105                 mem.device_pointer = mem.data_pointer;
106
107                 stats.mem_alloc(mem.memory_size());
108         }
109
110         void tex_free(device_memory& mem)
111         {
112                 mem.device_pointer = 0;
113
114                 stats.mem_free(mem.memory_size());
115         }
116
117         void *osl_memory()
118         {
119 #ifdef WITH_OSL
120                 return &osl_globals;
121 #else
122                 return NULL;
123 #endif
124         }
125
126         void thread_run(DeviceTask *task)
127         {
128                 if(task->type == DeviceTask::PATH_TRACE)
129                         thread_path_trace(*task);
130                 else if(task->type == DeviceTask::TONEMAP)
131                         thread_tonemap(*task);
132                 else if(task->type == DeviceTask::SHADER)
133                         thread_shader(*task);
134         }
135
136         class CPUDeviceTask : public DeviceTask {
137         public:
138                 CPUDeviceTask(CPUDevice *device, DeviceTask& task)
139                 : DeviceTask(task)
140                 {
141                         run = function_bind(&CPUDevice::thread_run, device, this);
142                 }
143         };
144
145         void thread_path_trace(DeviceTask& task)
146         {
147                 if(task_pool.cancelled()) {
148                         if(task.need_finish_queue == false)
149                                 return;
150                 }
151
152                 KernelGlobals kg = kernel_globals;
153
154 #ifdef WITH_OSL
155                 OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
156 #endif
157
158                 RenderTile tile;
159                 
160                 while(task.acquire_tile(this, tile)) {
161                         float *render_buffer = (float*)tile.buffer;
162                         uint *rng_state = (uint*)tile.rng_state;
163                         int start_sample = tile.start_sample;
164                         int end_sample = tile.start_sample + tile.num_samples;
165
166 #ifdef WITH_OPTIMIZED_KERNEL
167                         if(system_cpu_support_sse3()) {
168                                 for(int sample = start_sample; sample < end_sample; sample++) {
169                                         if (task.get_cancel() || task_pool.cancelled()) {
170                                                 if(task.need_finish_queue == false)
171                                                         break;
172                                         }
173
174                                         for(int y = tile.y; y < tile.y + tile.h; y++) {
175                                                 for(int x = tile.x; x < tile.x + tile.w; x++) {
176                                                         kernel_cpu_sse3_path_trace(&kg, render_buffer, rng_state,
177                                                                 sample, x, y, tile.offset, tile.stride);
178                                                 }
179                                         }
180
181                                         tile.sample = sample + 1;
182
183                                         task.update_progress(tile);
184                                 }
185                         }
186                         else if(system_cpu_support_sse2()) {
187                                 for(int sample = start_sample; sample < end_sample; sample++) {
188                                         if (task.get_cancel() || task_pool.cancelled()) {
189                                                 if(task.need_finish_queue == false)
190                                                         break;
191                                         }
192
193                                         for(int y = tile.y; y < tile.y + tile.h; y++) {
194                                                 for(int x = tile.x; x < tile.x + tile.w; x++) {
195                                                         kernel_cpu_sse2_path_trace(&kg, render_buffer, rng_state,
196                                                                 sample, x, y, tile.offset, tile.stride);
197                                                 }
198                                         }
199
200                                         tile.sample = sample + 1;
201
202                                         task.update_progress(tile);
203                                 }
204                         }
205                         else
206 #endif
207                         {
208                                 for(int sample = start_sample; sample < end_sample; sample++) {
209                                         if (task.get_cancel() || task_pool.cancelled()) {
210                                                 if(task.need_finish_queue == false)
211                                                         break;
212                                         }
213
214                                         for(int y = tile.y; y < tile.y + tile.h; y++) {
215                                                 for(int x = tile.x; x < tile.x + tile.w; x++) {
216                                                         kernel_cpu_path_trace(&kg, render_buffer, rng_state,
217                                                                 sample, x, y, tile.offset, tile.stride);
218                                                 }
219                                         }
220
221                                         tile.sample = sample + 1;
222
223                                         task.update_progress(tile);
224                                 }
225                         }
226
227                         task.release_tile(tile);
228
229                         if(task_pool.cancelled()) {
230                                 if(task.need_finish_queue == false)
231                                         break;
232                         }
233                 }
234
235 #ifdef WITH_OSL
236                 OSLShader::thread_free(&kg);
237 #endif
238         }
239
240         void thread_tonemap(DeviceTask& task)
241         {
242 #ifdef WITH_OPTIMIZED_KERNEL
243                 if(system_cpu_support_sse3()) {
244                         for(int y = task.y; y < task.y + task.h; y++)
245                                 for(int x = task.x; x < task.x + task.w; x++)
246                                         kernel_cpu_sse3_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer,
247                                                 task.sample, x, y, task.offset, task.stride);
248                 }
249                 else if(system_cpu_support_sse2()) {
250                         for(int y = task.y; y < task.y + task.h; y++)
251                                 for(int x = task.x; x < task.x + task.w; x++)
252                                         kernel_cpu_sse2_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer,
253                                                 task.sample, x, y, task.offset, task.stride);
254                 }
255                 else
256 #endif
257                 {
258                         for(int y = task.y; y < task.y + task.h; y++)
259                                 for(int x = task.x; x < task.x + task.w; x++)
260                                         kernel_cpu_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer,
261                                                 task.sample, x, y, task.offset, task.stride);
262                 }
263         }
264
265         void thread_shader(DeviceTask& task)
266         {
267                 KernelGlobals kg = kernel_globals;
268
269 #ifdef WITH_OSL
270                 OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
271 #endif
272
273 #ifdef WITH_OPTIMIZED_KERNEL
274                 if(system_cpu_support_sse3()) {
275                         for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
276                                 kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
277
278                                 if(task_pool.cancelled())
279                                         break;
280                         }
281                 }
282                 else if(system_cpu_support_sse2()) {
283                         for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
284                                 kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
285
286                                 if(task_pool.cancelled())
287                                         break;
288                         }
289                 }
290                 else
291 #endif
292                 {
293                         for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
294                                 kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
295
296                                 if(task_pool.cancelled())
297                                         break;
298                         }
299                 }
300
301 #ifdef WITH_OSL
302                 OSLShader::thread_free(&kg);
303 #endif
304         }
305
306         void task_add(DeviceTask& task)
307         {
308                 /* split task into smaller ones */
309                 list<DeviceTask> tasks;
310                 task.split(tasks, TaskScheduler::num_threads());
311
312                 foreach(DeviceTask& task, tasks)
313                         task_pool.push(new CPUDeviceTask(this, task));
314         }
315
316         void task_wait()
317         {
318                 task_pool.wait_work();
319         }
320
321         void task_cancel()
322         {
323                 task_pool.cancel();
324         }
325 };
326
327 Device *device_cpu_create(DeviceInfo& info, Stats &stats)
328 {
329         return new CPUDevice(stats);
330 }
331
332 void device_cpu_info(vector<DeviceInfo>& devices)
333 {
334         DeviceInfo info;
335
336         info.type = DEVICE_CPU;
337         info.description = system_cpu_brand_string();
338         info.id = "CPU";
339         info.num = 0;
340         info.advanced_shading = true;
341         info.pack_images = false;
342
343         devices.insert(devices.begin(), info);
344 }
345
346 CCL_NAMESPACE_END
347