676b1279a80ce3c80212bf69e37b4dfc5e79f346
[blender.git] / intern / cycles / device / device_cpu.cpp
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include <stdlib.h>
18 #include <string.h>
19
20 /* So ImathMath is included before our kernel_compat_cpu.h. */
21 #ifdef WITH_OSL
22 /* So no context pollution happens from indirectly included windows.h */
23 #  include "util_windows.h"
24 #  include <OSL/oslexec.h>
25 #endif
26
27 #include "device.h"
28 #include "device_intern.h"
29
30 #include "kernel.h"
31 #include "kernel_compat_cpu.h"
32 #include "kernel_types.h"
33 #include "kernel_globals.h"
34
35 #include "osl_shader.h"
36 #include "osl_globals.h"
37
38 #include "buffers.h"
39
40 #include "util_debug.h"
41 #include "util_foreach.h"
42 #include "util_function.h"
43 #include "util_logging.h"
44 #include "util_opengl.h"
45 #include "util_progress.h"
46 #include "util_system.h"
47 #include "util_thread.h"
48
49 CCL_NAMESPACE_BEGIN
50
51 class CPUDevice : public Device
52 {
53 public:
54         TaskPool task_pool;
55         KernelGlobals kernel_globals;
56
57 #ifdef WITH_OSL
58         OSLGlobals osl_globals;
59 #endif
60         
61         CPUDevice(DeviceInfo& info, Stats &stats, bool background)
62         : Device(info, stats, background)
63         {
64 #ifdef WITH_OSL
65                 kernel_globals.osl = &osl_globals;
66 #endif
67
68                 /* do now to avoid thread issues */
69                 system_cpu_support_sse2();
70                 system_cpu_support_sse3();
71                 system_cpu_support_sse41();
72                 system_cpu_support_avx();
73                 system_cpu_support_avx2();
74
75 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
76                 if(system_cpu_support_avx2()) {
77                         VLOG(1) << "Will be using AVX2 kernels.";
78                 }
79                 else
80 #endif
81 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
82                 if(system_cpu_support_avx()) {
83                         VLOG(1) << "Will be using AVX kernels.";
84                 }
85                 else
86 #endif
87 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
88                 if(system_cpu_support_sse41()) {
89                         VLOG(1) << "Will be using SSE4.1 kernels.";
90                 }
91                 else
92 #endif
93 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
94                 if(system_cpu_support_sse3()) {
95                         VLOG(1) << "Will be using SSE3kernels.";
96                 }
97                 else
98 #endif
99 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
100                 if(system_cpu_support_sse2()) {
101                         VLOG(1) << "Will be using SSE2 kernels.";
102                 }
103                 else
104 #endif
105                 {
106                         VLOG(1) << "Will be using regular kernels.";
107                 }
108         }
109
110         ~CPUDevice()
111         {
112                 task_pool.stop();
113         }
114
115         void mem_alloc(device_memory& mem, MemoryType /*type*/)
116         {
117                 mem.device_pointer = mem.data_pointer;
118                 mem.device_size = mem.memory_size();
119                 stats.mem_alloc(mem.device_size);
120         }
121
122         void mem_copy_to(device_memory& /*mem*/)
123         {
124                 /* no-op */
125         }
126
127         void mem_copy_from(device_memory& /*mem*/,
128                            int /*y*/, int /*w*/, int /*h*/,
129                            int /*elem*/)
130         {
131                 /* no-op */
132         }
133
134         void mem_zero(device_memory& mem)
135         {
136                 memset((void*)mem.device_pointer, 0, mem.memory_size());
137         }
138
139         void mem_free(device_memory& mem)
140         {
141                 if(mem.device_pointer) {
142                         mem.device_pointer = 0;
143                         stats.mem_free(mem.device_size);
144                         mem.device_size = 0;
145                 }
146         }
147
148         void const_copy_to(const char *name, void *host, size_t size)
149         {
150                 kernel_const_copy(&kernel_globals, name, host, size);
151         }
152
153         void tex_alloc(const char *name,
154                        device_memory& mem,
155                        InterpolationType interpolation,
156                        ExtensionType extension)
157         {
158                 VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
159                 kernel_tex_copy(&kernel_globals,
160                                 name,
161                                 mem.data_pointer,
162                                 mem.data_width,
163                                 mem.data_height,
164                                 mem.data_depth,
165                                 interpolation,
166                                 extension);
167                 mem.device_pointer = mem.data_pointer;
168                 mem.device_size = mem.memory_size();
169                 stats.mem_alloc(mem.device_size);
170         }
171
172         void tex_free(device_memory& mem)
173         {
174                 if(mem.device_pointer) {
175                         mem.device_pointer = 0;
176                         stats.mem_free(mem.device_size);
177                         mem.device_size = 0;
178                 }
179         }
180
181         void *osl_memory()
182         {
183 #ifdef WITH_OSL
184                 return &osl_globals;
185 #else
186                 return NULL;
187 #endif
188         }
189
190         void thread_run(DeviceTask *task)
191         {
192                 if(task->type == DeviceTask::PATH_TRACE)
193                         thread_path_trace(*task);
194                 else if(task->type == DeviceTask::FILM_CONVERT)
195                         thread_film_convert(*task);
196                 else if(task->type == DeviceTask::SHADER)
197                         thread_shader(*task);
198         }
199
200         class CPUDeviceTask : public DeviceTask {
201         public:
202                 CPUDeviceTask(CPUDevice *device, DeviceTask& task)
203                 : DeviceTask(task)
204                 {
205                         run = function_bind(&CPUDevice::thread_run, device, this);
206                 }
207         };
208
209         void thread_path_trace(DeviceTask& task)
210         {
211                 if(task_pool.canceled()) {
212                         if(task.need_finish_queue == false)
213                                 return;
214                 }
215
216                 KernelGlobals kg = kernel_globals;
217
218 #ifdef WITH_OSL
219                 OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
220 #endif
221
222                 RenderTile tile;
223
224                 void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int);
225
226 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
227                 if(system_cpu_support_avx2()) {
228                         path_trace_kernel = kernel_cpu_avx2_path_trace;
229                 }
230                 else
231 #endif
232 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
233                 if(system_cpu_support_avx()) {
234                         path_trace_kernel = kernel_cpu_avx_path_trace;
235                 }
236                 else
237 #endif
238 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
239                 if(system_cpu_support_sse41()) {
240                         path_trace_kernel = kernel_cpu_sse41_path_trace;
241                 }
242                 else
243 #endif
244 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
245                 if(system_cpu_support_sse3()) {
246                         path_trace_kernel = kernel_cpu_sse3_path_trace;
247                 }
248                 else
249 #endif
250 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
251                 if(system_cpu_support_sse2()) {
252                         path_trace_kernel = kernel_cpu_sse2_path_trace;
253                 }
254                 else
255 #endif
256                 {
257                         path_trace_kernel = kernel_cpu_path_trace;
258                 }
259                 
260                 while(task.acquire_tile(this, tile)) {
261                         float *render_buffer = (float*)tile.buffer;
262                         uint *rng_state = (uint*)tile.rng_state;
263                         int start_sample = tile.start_sample;
264                         int end_sample = tile.start_sample + tile.num_samples;
265
266                         for(int sample = start_sample; sample < end_sample; sample++) {
267                                 if(task.get_cancel() || task_pool.canceled()) {
268                                         if(task.need_finish_queue == false)
269                                                 break;
270                                 }
271
272                                 for(int y = tile.y; y < tile.y + tile.h; y++) {
273                                         for(int x = tile.x; x < tile.x + tile.w; x++) {
274                                                 path_trace_kernel(&kg, render_buffer, rng_state,
275                                                                   sample, x, y, tile.offset, tile.stride);
276                                         }
277                                 }
278
279                                 tile.sample = sample + 1;
280
281                                 task.update_progress(&tile);
282                         }
283
284                         task.release_tile(tile);
285
286                         if(task_pool.canceled()) {
287                                 if(task.need_finish_queue == false)
288                                         break;
289                         }
290                 }
291
292 #ifdef WITH_OSL
293                 OSLShader::thread_free(&kg);
294 #endif
295         }
296
297         void thread_film_convert(DeviceTask& task)
298         {
299                 float sample_scale = 1.0f/(task.sample + 1);
300
301                 if(task.rgba_half) {
302                         void(*convert_to_half_float_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
303 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
304                         if(system_cpu_support_avx2()) {
305                                 convert_to_half_float_kernel = kernel_cpu_avx2_convert_to_half_float;
306                         }
307                         else
308 #endif
309 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
310                         if(system_cpu_support_avx()) {
311                                 convert_to_half_float_kernel = kernel_cpu_avx_convert_to_half_float;
312                         }
313                         else
314 #endif  
315 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41                       
316                         if(system_cpu_support_sse41()) {
317                                 convert_to_half_float_kernel = kernel_cpu_sse41_convert_to_half_float;
318                         }
319                         else
320 #endif          
321 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3                
322                         if(system_cpu_support_sse3()) {
323                                 convert_to_half_float_kernel = kernel_cpu_sse3_convert_to_half_float;
324                         }
325                         else
326 #endif
327 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
328                         if(system_cpu_support_sse2()) {
329                                 convert_to_half_float_kernel = kernel_cpu_sse2_convert_to_half_float;
330                         }
331                         else
332 #endif
333                         {
334                                 convert_to_half_float_kernel = kernel_cpu_convert_to_half_float;
335                         }
336
337                         for(int y = task.y; y < task.y + task.h; y++)
338                                 for(int x = task.x; x < task.x + task.w; x++)
339                                         convert_to_half_float_kernel(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
340                                                 sample_scale, x, y, task.offset, task.stride);
341                 }
342                 else {
343                         void(*convert_to_byte_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
344 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
345                         if(system_cpu_support_avx2()) {
346                                 convert_to_byte_kernel = kernel_cpu_avx2_convert_to_byte;
347                         }
348                         else
349 #endif
350 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
351                         if(system_cpu_support_avx()) {
352                                 convert_to_byte_kernel = kernel_cpu_avx_convert_to_byte;
353                         }
354                         else
355 #endif          
356 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41                       
357                         if(system_cpu_support_sse41()) {
358                                 convert_to_byte_kernel = kernel_cpu_sse41_convert_to_byte;
359                         }
360                         else
361 #endif                  
362 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
363                         if(system_cpu_support_sse3()) {
364                                 convert_to_byte_kernel = kernel_cpu_sse3_convert_to_byte;
365                         }
366                         else
367 #endif
368 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
369                         if(system_cpu_support_sse2()) {
370                                 convert_to_byte_kernel = kernel_cpu_sse2_convert_to_byte;
371                         }
372                         else
373 #endif
374                         {
375                                 convert_to_byte_kernel = kernel_cpu_convert_to_byte;
376                         }
377
378                         for(int y = task.y; y < task.y + task.h; y++)
379                                 for(int x = task.x; x < task.x + task.w; x++)
380                                         convert_to_byte_kernel(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
381                                                 sample_scale, x, y, task.offset, task.stride);
382
383                 }
384         }
385
386         void thread_shader(DeviceTask& task)
387         {
388                 KernelGlobals kg = kernel_globals;
389
390 #ifdef WITH_OSL
391                 OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
392 #endif
393                 void(*shader_kernel)(KernelGlobals*, uint4*, float4*, float*, int, int, int, int, int);
394
395 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
396                 if(system_cpu_support_avx2()) {
397                         shader_kernel = kernel_cpu_avx2_shader;
398                 }
399                 else
400 #endif
401 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
402                 if(system_cpu_support_avx()) {
403                         shader_kernel = kernel_cpu_avx_shader;
404                 }
405                 else
406 #endif
407 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41                       
408                 if(system_cpu_support_sse41()) {
409                         shader_kernel = kernel_cpu_sse41_shader;
410                 }
411                 else
412 #endif
413 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
414                 if(system_cpu_support_sse3()) {
415                         shader_kernel = kernel_cpu_sse3_shader;
416                 }
417                 else
418 #endif
419 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
420                 if(system_cpu_support_sse2()) {
421                         shader_kernel = kernel_cpu_sse2_shader;
422                 }
423                 else
424 #endif
425                 {
426                         shader_kernel = kernel_cpu_shader;
427                 }
428
429                 for(int sample = 0; sample < task.num_samples; sample++) {
430                         for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
431                                 shader_kernel(&kg,
432                                               (uint4*)task.shader_input,
433                                               (float4*)task.shader_output,
434                                               (float*)task.shader_output_luma,
435                                               task.shader_eval_type,
436                                               task.shader_filter,
437                                               x,
438                                               task.offset,
439                                               sample);
440
441                         if(task.get_cancel() || task_pool.canceled())
442                                 break;
443
444                         task.update_progress(NULL);
445
446                 }
447
448 #ifdef WITH_OSL
449                 OSLShader::thread_free(&kg);
450 #endif
451         }
452
453         int get_split_task_count(DeviceTask& task)
454         {
455                 if(task.type == DeviceTask::SHADER)
456                         return task.get_subtask_count(TaskScheduler::num_threads(), 256);
457                 else
458                         return task.get_subtask_count(TaskScheduler::num_threads());
459         }
460
461         void task_add(DeviceTask& task)
462         {
463                 /* split task into smaller ones */
464                 list<DeviceTask> tasks;
465
466                 if(task.type == DeviceTask::SHADER)
467                         task.split(tasks, TaskScheduler::num_threads(), 256);
468                 else
469                         task.split(tasks, TaskScheduler::num_threads());
470
471                 foreach(DeviceTask& task, tasks)
472                         task_pool.push(new CPUDeviceTask(this, task));
473         }
474
475         void task_wait()
476         {
477                 task_pool.wait_work();
478         }
479
480         void task_cancel()
481         {
482                 task_pool.cancel();
483         }
484 };
485
486 Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
487 {
488         return new CPUDevice(info, stats, background);
489 }
490
491 void device_cpu_info(vector<DeviceInfo>& devices)
492 {
493         DeviceInfo info;
494
495         info.type = DEVICE_CPU;
496         info.description = system_cpu_brand_string();
497         info.id = "CPU";
498         info.num = 0;
499         info.advanced_shading = true;
500         info.pack_images = false;
501
502         devices.insert(devices.begin(), info);
503 }
504
505 string device_cpu_capabilities(void)
506 {
507         string capabilities = "";
508         capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
509         capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
510         capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
511         capabilities += system_cpu_support_avx() ? "AVX " : "";
512         capabilities += system_cpu_support_avx2() ? "AVX2" : "";
513         if(capabilities[capabilities.size() - 1] == ' ')
514                 capabilities.resize(capabilities.size() - 1);
515         return capabilities;
516 }
517
518 CCL_NAMESPACE_END