Cycles: Code cleanup, spaces around keywords
[blender-staging.git] / intern / cycles / device / device_cpu.cpp
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include <stdlib.h>
18 #include <string.h>
19
20 /* So ImathMath is included before our kernel_cpu_compat. */
21 #ifdef WITH_OSL
22 #  include <OSL/oslexec.h>
23 #endif
24
25 #include "device.h"
26 #include "device_intern.h"
27
28 #include "kernel.h"
29 #include "kernel_compat_cpu.h"
30 #include "kernel_types.h"
31 #include "kernel_globals.h"
32
33 #include "osl_shader.h"
34 #include "osl_globals.h"
35
36 #include "buffers.h"
37
38 #include "util_debug.h"
39 #include "util_foreach.h"
40 #include "util_function.h"
41 #include "util_opengl.h"
42 #include "util_progress.h"
43 #include "util_system.h"
44 #include "util_thread.h"
45
46 CCL_NAMESPACE_BEGIN
47
48 class CPUDevice : public Device
49 {
50 public:
51         TaskPool task_pool;
52         KernelGlobals kernel_globals;
53
54 #ifdef WITH_OSL
55         OSLGlobals osl_globals;
56 #endif
57         
58         CPUDevice(DeviceInfo& info, Stats &stats, bool background)
59         : Device(info, stats, background)
60         {
61 #ifdef WITH_OSL
62                 kernel_globals.osl = &osl_globals;
63 #endif
64
65                 /* do now to avoid thread issues */
66                 system_cpu_support_sse2();
67                 system_cpu_support_sse3();
68                 system_cpu_support_sse41();
69                 system_cpu_support_avx();
70                 system_cpu_support_avx2();
71         }
72
73         ~CPUDevice()
74         {
75                 task_pool.stop();
76         }
77
78         void mem_alloc(device_memory& mem, MemoryType /*type*/)
79         {
80                 mem.device_pointer = mem.data_pointer;
81                 mem.device_size = mem.memory_size();
82                 stats.mem_alloc(mem.device_size);
83         }
84
85         void mem_copy_to(device_memory& /*mem*/)
86         {
87                 /* no-op */
88         }
89
90         void mem_copy_from(device_memory& /*mem*/,
91                            int /*y*/, int /*w*/, int /*h*/,
92                            int /*elem*/)
93         {
94                 /* no-op */
95         }
96
97         void mem_zero(device_memory& mem)
98         {
99                 memset((void*)mem.device_pointer, 0, mem.memory_size());
100         }
101
102         void mem_free(device_memory& mem)
103         {
104                 if(mem.device_pointer) {
105                         mem.device_pointer = 0;
106                         stats.mem_free(mem.device_size);
107                         mem.device_size = 0;
108                 }
109         }
110
111         void const_copy_to(const char *name, void *host, size_t size)
112         {
113                 kernel_const_copy(&kernel_globals, name, host, size);
114         }
115
116         void tex_alloc(const char *name, device_memory& mem, InterpolationType interpolation, bool /*periodic*/)
117         {
118                 kernel_tex_copy(&kernel_globals, name, mem.data_pointer, mem.data_width, mem.data_height, mem.data_depth, interpolation);
119                 mem.device_pointer = mem.data_pointer;
120                 mem.device_size = mem.memory_size();
121                 stats.mem_alloc(mem.device_size);
122         }
123
124         void tex_free(device_memory& mem)
125         {
126                 if(mem.device_pointer) {
127                         mem.device_pointer = 0;
128                         stats.mem_free(mem.device_size);
129                         mem.device_size = 0;
130                 }
131         }
132
133         void *osl_memory()
134         {
135 #ifdef WITH_OSL
136                 return &osl_globals;
137 #else
138                 return NULL;
139 #endif
140         }
141
142         void thread_run(DeviceTask *task)
143         {
144                 if(task->type == DeviceTask::PATH_TRACE)
145                         thread_path_trace(*task);
146                 else if(task->type == DeviceTask::FILM_CONVERT)
147                         thread_film_convert(*task);
148                 else if(task->type == DeviceTask::SHADER)
149                         thread_shader(*task);
150         }
151
152         class CPUDeviceTask : public DeviceTask {
153         public:
154                 CPUDeviceTask(CPUDevice *device, DeviceTask& task)
155                 : DeviceTask(task)
156                 {
157                         run = function_bind(&CPUDevice::thread_run, device, this);
158                 }
159         };
160
161         void thread_path_trace(DeviceTask& task)
162         {
163                 if(task_pool.canceled()) {
164                         if(task.need_finish_queue == false)
165                                 return;
166                 }
167
168                 KernelGlobals kg = kernel_globals;
169
170 #ifdef WITH_OSL
171                 OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
172 #endif
173
174                 RenderTile tile;
175
176                 void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int);
177
178 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
179                 if(system_cpu_support_avx2())
180                         path_trace_kernel = kernel_cpu_avx2_path_trace;
181                 else
182 #endif
183 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
184                 if(system_cpu_support_avx())
185                         path_trace_kernel = kernel_cpu_avx_path_trace;
186                 else
187 #endif
188 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
189                 if(system_cpu_support_sse41())
190                         path_trace_kernel = kernel_cpu_sse41_path_trace;
191                 else
192 #endif
193 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
194                 if(system_cpu_support_sse3())
195                         path_trace_kernel = kernel_cpu_sse3_path_trace;
196                 else
197 #endif
198 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
199                 if(system_cpu_support_sse2())
200                         path_trace_kernel = kernel_cpu_sse2_path_trace;
201                 else
202 #endif
203                         path_trace_kernel = kernel_cpu_path_trace;
204                 
205                 while(task.acquire_tile(this, tile)) {
206                         float *render_buffer = (float*)tile.buffer;
207                         uint *rng_state = (uint*)tile.rng_state;
208                         int start_sample = tile.start_sample;
209                         int end_sample = tile.start_sample + tile.num_samples;
210
211                         for(int sample = start_sample; sample < end_sample; sample++) {
212                                 if(task.get_cancel() || task_pool.canceled()) {
213                                         if(task.need_finish_queue == false)
214                                                 break;
215                                 }
216
217                                 for(int y = tile.y; y < tile.y + tile.h; y++) {
218                                         for(int x = tile.x; x < tile.x + tile.w; x++) {
219                                                 path_trace_kernel(&kg, render_buffer, rng_state,
220                                                                   sample, x, y, tile.offset, tile.stride);
221                                         }
222                                 }
223
224                                 tile.sample = sample + 1;
225
226                                 task.update_progress(&tile);
227                         }
228
229                         task.release_tile(tile);
230
231                         if(task_pool.canceled()) {
232                                 if(task.need_finish_queue == false)
233                                         break;
234                         }
235                 }
236
237 #ifdef WITH_OSL
238                 OSLShader::thread_free(&kg);
239 #endif
240         }
241
242         void thread_film_convert(DeviceTask& task)
243         {
244                 float sample_scale = 1.0f/(task.sample + 1);
245
246                 if(task.rgba_half) {
247                         void(*convert_to_half_float_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
248 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
249                         if(system_cpu_support_avx2())
250                                 convert_to_half_float_kernel = kernel_cpu_avx2_convert_to_half_float;
251                         else
252 #endif
253 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
254                         if(system_cpu_support_avx())
255                                 for(int y = task.y; y < task.y + task.h; y++)
256                                 convert_to_half_float_kernel = kernel_cpu_avx_convert_to_half_float;
257                         else
258 #endif  
259 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41                       
260                         if(system_cpu_support_sse41())
261                                 convert_to_half_float_kernel = kernel_cpu_sse41_convert_to_half_float;
262                         else
263 #endif          
264 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3                
265                         if(system_cpu_support_sse3())
266                                 convert_to_half_float_kernel = kernel_cpu_sse3_convert_to_half_float;
267                         else
268 #endif
269 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
270                         if(system_cpu_support_sse2())
271                                 convert_to_half_float_kernel = kernel_cpu_sse2_convert_to_half_float;
272                         else
273 #endif
274                                 convert_to_half_float_kernel = kernel_cpu_convert_to_half_float;
275
276                         for(int y = task.y; y < task.y + task.h; y++)
277                                 for(int x = task.x; x < task.x + task.w; x++)
278                                         convert_to_half_float_kernel(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
279                                                 sample_scale, x, y, task.offset, task.stride);
280                 }
281                 else {
282                         void(*convert_to_byte_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
283 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
284                         if(system_cpu_support_avx2())
285                                 convert_to_byte_kernel = kernel_cpu_avx2_convert_to_byte;
286                         else
287 #endif
288 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
289                         if(system_cpu_support_avx())
290                                 convert_to_byte_kernel = kernel_cpu_avx_convert_to_byte;
291                         else
292 #endif          
293 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41                       
294                         if(system_cpu_support_sse41())
295                                 convert_to_byte_kernel = kernel_cpu_sse41_convert_to_byte;
296                         else
297 #endif                  
298 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
299                         if(system_cpu_support_sse3())
300                                 convert_to_byte_kernel = kernel_cpu_sse3_convert_to_byte;
301                         else
302 #endif
303 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
304                         if(system_cpu_support_sse2())
305                                 convert_to_byte_kernel = kernel_cpu_sse2_convert_to_byte;
306                         else
307 #endif
308                                 convert_to_byte_kernel = kernel_cpu_convert_to_byte;
309
310                         for(int y = task.y; y < task.y + task.h; y++)
311                                 for(int x = task.x; x < task.x + task.w; x++)
312                                         convert_to_byte_kernel(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
313                                                 sample_scale, x, y, task.offset, task.stride);
314
315                 }
316         }
317
318         void thread_shader(DeviceTask& task)
319         {
320                 KernelGlobals kg = kernel_globals;
321
322 #ifdef WITH_OSL
323                 OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
324 #endif
325                 void(*shader_kernel)(KernelGlobals*, uint4*, float4*, int, int, int, int);
326
327 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
328                 if(system_cpu_support_avx2())
329                         shader_kernel = kernel_cpu_avx2_shader;
330                 else
331 #endif
332 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
333                 if(system_cpu_support_avx())
334                         shader_kernel = kernel_cpu_avx_shader;
335                 else
336 #endif
337 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41                       
338                 if(system_cpu_support_sse41())
339                         shader_kernel = kernel_cpu_sse41_shader;
340                 else
341 #endif
342 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
343                 if(system_cpu_support_sse3())
344                         shader_kernel = kernel_cpu_sse3_shader;
345                 else
346 #endif
347 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
348                 if(system_cpu_support_sse2())
349                         shader_kernel = kernel_cpu_sse2_shader;
350                 else
351 #endif
352                         shader_kernel = kernel_cpu_shader;
353
354                 for(int sample = 0; sample < task.num_samples; sample++) {
355                         for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
356                                 shader_kernel(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
357                                         task.shader_eval_type, x, task.offset, sample);
358
359                         if(task.get_cancel() || task_pool.canceled())
360                                 break;
361
362                         task.update_progress(NULL);
363
364                 }
365
366 #ifdef WITH_OSL
367                 OSLShader::thread_free(&kg);
368 #endif
369         }
370
371         int get_split_task_count(DeviceTask& task)
372         {
373                 if(task.type == DeviceTask::SHADER)
374                         return task.get_subtask_count(TaskScheduler::num_threads(), 256);
375                 else
376                         return task.get_subtask_count(TaskScheduler::num_threads());
377         }
378
379         void task_add(DeviceTask& task)
380         {
381                 /* split task into smaller ones */
382                 list<DeviceTask> tasks;
383
384                 if(task.type == DeviceTask::SHADER)
385                         task.split(tasks, TaskScheduler::num_threads(), 256);
386                 else
387                         task.split(tasks, TaskScheduler::num_threads());
388
389                 foreach(DeviceTask& task, tasks)
390                         task_pool.push(new CPUDeviceTask(this, task));
391         }
392
393         void task_wait()
394         {
395                 task_pool.wait_work();
396         }
397
398         void task_cancel()
399         {
400                 task_pool.cancel();
401         }
402 };
403
404 Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
405 {
406         return new CPUDevice(info, stats, background);
407 }
408
409 void device_cpu_info(vector<DeviceInfo>& devices)
410 {
411         DeviceInfo info;
412
413         info.type = DEVICE_CPU;
414         info.description = system_cpu_brand_string();
415         info.id = "CPU";
416         info.num = 0;
417         info.advanced_shading = true;
418         info.pack_images = false;
419
420         devices.insert(devices.begin(), info);
421 }
422
423 string device_cpu_capabilities(void)
424 {
425         string capabilities = "";
426         capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
427         capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
428         capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
429         capabilities += system_cpu_support_avx() ? "AVX " : "";
430         capabilities += system_cpu_support_avx2() ? "AVX2" : "";
431         if(capabilities[capabilities.size() - 1] == ' ')
432                 capabilities.resize(capabilities.size() - 1);
433         return capabilities;
434 }
435
436 CCL_NAMESPACE_END