/* Cycles: threading optimizations
 * [blender.git] intern/cycles/device/device_cpu.cpp */
/*
 * Copyright 2011, Blender Foundation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */
18
19 #include <stdlib.h>
20 #include <string.h>
21
22 #include "device.h"
23 #include "device_intern.h"
24
25 #include "kernel.h"
26 #include "kernel_types.h"
27
28 #include "osl_shader.h"
29
30 #include "util_debug.h"
31 #include "util_foreach.h"
32 #include "util_function.h"
33 #include "util_opengl.h"
34 #include "util_progress.h"
35 #include "util_system.h"
36 #include "util_thread.h"
37
38 CCL_NAMESPACE_BEGIN
39
/* CPU rendering device.
 *
 * "Device" memory is host memory itself: device pointers simply alias the
 * host data pointers, so the alloc/copy operations below are (mostly)
 * no-ops. Rendering work is split into many small tasks and executed on a
 * thread pool. */
class CPUDevice : public Device
{
public:
	TaskPool task_pool;  /* worker pool that runs the rendering tasks */
	KernelGlobals *kg;   /* kernel globals (constants, textures); shared by all worker threads */

	/* NOTE(review): threads_num is unused here; the split count in task_add()
	 * comes from TaskScheduler::num_threads() instead — confirm intended. */
	CPUDevice(int threads_num)
	{
		kg = kernel_globals_create();

		/* do now to avoid thread issues: the capability query is executed
		 * once on the main thread before any workers can race on it */
		system_cpu_support_optimized();
	}

	~CPUDevice()
	{
		/* stop workers before freeing the globals they may still reference */
		task_pool.stop();
		kernel_globals_free(kg);
	}

	/* CPU kernels support the full shading feature set */
	bool support_advanced_shading()
	{
		return true;
	}

	/* "allocation" just aliases the host pointer; no separate device copy */
	void mem_alloc(device_memory& mem, MemoryType type)
	{
		mem.device_pointer = mem.data_pointer;
	}

	/* host and device memory are the same, nothing to transfer */
	void mem_copy_to(device_memory& mem)
	{
		/* no-op */
	}

	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
	{
		/* no-op */
	}

	void mem_zero(device_memory& mem)
	{
		memset((void*)mem.device_pointer, 0, mem.memory_size());
	}

	/* only drop the alias; the underlying host buffer is owned elsewhere */
	void mem_free(device_memory& mem)
	{
		mem.device_pointer = 0;
	}

	void const_copy_to(const char *name, void *host, size_t size)
	{
		kernel_const_copy(kg, name, host, size);
	}

	/* register texture data with the kernel globals under its name;
	 * the interpolation/periodic flags are not used by this CPU path */
	void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
	{
		kernel_tex_copy(kg, name, mem.data_pointer, mem.data_width, mem.data_height);
		mem.device_pointer = mem.data_pointer;
	}

	void tex_free(device_memory& mem)
	{
		mem.device_pointer = 0;
	}

	/* OSL shading system memory; NULL when built without OSL */
	void *osl_memory()
	{
#ifdef WITH_OSL
		return kernel_osl_memory(kg);
#else
		return NULL;
#endif
	}

	/* entry point executed on a pool worker thread; dispatch on task type */
	void thread_run(DeviceTask *task)
	{
		if(task->type == DeviceTask::PATH_TRACE)
			thread_path_trace(*task);
		else if(task->type == DeviceTask::TONEMAP)
			thread_tonemap(*task);
		else if(task->type == DeviceTask::SHADER)
			thread_shader(*task);
	}

	/* wraps a DeviceTask so the pool invokes CPUDevice::thread_run on it */
	class CPUDeviceTask : public DeviceTask {
	public:
		CPUDeviceTask(CPUDevice *device, DeviceTask& task)
		: DeviceTask(task)
		{
			run = function_bind(&CPUDevice::thread_run, device, this);
		}
	};

	/* path trace one tile, scanline by scanline; cancellation is checked
	 * once per scanline to keep abort latency low without per-pixel cost */
	void thread_path_trace(DeviceTask& task)
	{
		if(task_pool.cancelled())
			return;

#ifdef WITH_OSL
		/* per-thread OSL shading state must be initialized before use */
		if(kernel_osl_use(kg))
			OSLShader::thread_init(kg);
#endif

#ifdef WITH_OPTIMIZED_KERNEL
		/* prefer the optimized kernel when this CPU supports it */
		if(system_cpu_support_optimized()) {
			for(int y = task.y; y < task.y + task.h; y++) {
				for(int x = task.x; x < task.x + task.w; x++)
					kernel_cpu_optimized_path_trace(kg, (float*)task.buffer, (unsigned int*)task.rng_state,
						task.sample, x, y, task.offset, task.stride);

				if(task_pool.cancelled())
					break;
			}
		}
		else
#endif
		{
			for(int y = task.y; y < task.y + task.h; y++) {
				for(int x = task.x; x < task.x + task.w; x++)
					kernel_cpu_path_trace(kg, (float*)task.buffer, (unsigned int*)task.rng_state,
						task.sample, x, y, task.offset, task.stride);

				if(task_pool.cancelled())
					break;
			}
		}

#ifdef WITH_OSL
		/* release per-thread OSL state acquired above */
		if(kernel_osl_use(kg))
			OSLShader::thread_free(kg);
#endif
	}

	/* convert accumulated float render buffer to display rgba for one tile;
	 * no cancellation checks here (tonemap is cheap relative to tracing) */
	void thread_tonemap(DeviceTask& task)
	{
#ifdef WITH_OPTIMIZED_KERNEL
		if(system_cpu_support_optimized()) {
			for(int y = task.y; y < task.y + task.h; y++)
				for(int x = task.x; x < task.x + task.w; x++)
					kernel_cpu_optimized_tonemap(kg, (uchar4*)task.rgba, (float*)task.buffer,
						task.sample, task.resolution, x, y, task.offset, task.stride);
		}
		else
#endif
		{
			for(int y = task.y; y < task.y + task.h; y++)
				for(int x = task.x; x < task.x + task.w; x++)
					kernel_cpu_tonemap(kg, (uchar4*)task.rgba, (float*)task.buffer,
						task.sample, task.resolution, x, y, task.offset, task.stride);
		}
	}

	/* evaluate shaders over a 1D range of inputs (e.g. for baking/displace);
	 * cancellation is checked after every element */
	void thread_shader(DeviceTask& task)
	{
#ifdef WITH_OSL
		if(kernel_osl_use(kg))
			OSLShader::thread_init(kg);
#endif

#ifdef WITH_OPTIMIZED_KERNEL
		if(system_cpu_support_optimized()) {
			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
				kernel_cpu_optimized_shader(kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);

				if(task_pool.cancelled())
					break;
			}
		}
		else
#endif
		{
			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
				kernel_cpu_shader(kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);

				if(task_pool.cancelled())
					break;
			}
		}

#ifdef WITH_OSL
		if(kernel_osl_use(kg))
			OSLShader::thread_free(kg);
#endif
	}

	/* queue a task for asynchronous execution on the pool */
	void task_add(DeviceTask& task)
	{
		/* split task into smaller ones, more than number of threads for uneven
		   workloads where some parts of the image render slower than others */
		list<DeviceTask> tasks;

		task.split(tasks, TaskScheduler::num_threads()*10);

		foreach(DeviceTask& task, tasks)
			task_pool.push(new CPUDeviceTask(this, task));
	}

	/* block until all queued tasks finish, helping with the work meanwhile */
	void task_wait()
	{
		task_pool.wait_work();
	}

	/* request cancellation of all queued/running tasks */
	void task_cancel()
	{
		task_pool.cancel();
	}
};
248
249 Device *device_cpu_create(DeviceInfo& info, int threads)
250 {
251         return new CPUDevice(threads);
252 }
253
254 void device_cpu_info(vector<DeviceInfo>& devices)
255 {
256         DeviceInfo info;
257
258         info.type = DEVICE_CPU;
259         info.description = system_cpu_brand_string();
260         info.id = "CPU";
261         info.num = 0;
262         info.advanced_shading = true;
263
264         devices.insert(devices.begin(), info);
265 }
266
267 CCL_NAMESPACE_END
268