Cycles: Reduce the number of malloc() calls from the kernel
intern/cycles/device/device_cpu.cpp
/*
 * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <stdlib.h>
#include <string.h>

/* So ImathMath is included before our kernel_cpu_compat. */
#ifdef WITH_OSL
/* So no context pollution happens from indirectly included windows.h */
#  include "util_windows.h"
#  include <OSL/oslexec.h>
#endif

#include "device.h"
#include "device_intern.h"

#include "kernel.h"
#include "kernel_compat_cpu.h"
#include "kernel_types.h"
#include "kernel_globals.h"

#include "osl_shader.h"
#include "osl_globals.h"

#include "buffers.h"

#include "util_debug.h"
#include "util_foreach.h"
#include "util_function.h"
#include "util_logging.h"
#include "util_opengl.h"
#include "util_progress.h"
#include "util_system.h"
#include "util_thread.h"

CCL_NAMESPACE_BEGIN

class CPUDevice : public Device
{
public:
	TaskPool task_pool;
	KernelGlobals kernel_globals;

#ifdef WITH_OSL
	OSLGlobals osl_globals;
#endif

	CPUDevice(DeviceInfo& info, Stats &stats, bool background)
	: Device(info, stats, background)
	{
#ifdef WITH_OSL
		kernel_globals.osl = &osl_globals;
#endif

		/* do now to avoid thread issues */
		system_cpu_support_sse2();
		system_cpu_support_sse3();
		system_cpu_support_sse41();
		system_cpu_support_avx();
		system_cpu_support_avx2();

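		/* The first kernel flavour that is both compiled in and supported by
		 * this CPU wins; each #ifdef block falls through to the next one and
		 * finally to the regular (non-optimized) kernels. */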
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
		if(system_cpu_support_avx2()) {
			VLOG(1) << "Will be using AVX2 kernels.";
		}
		else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
		if(system_cpu_support_avx()) {
			VLOG(1) << "Will be using AVX kernels.";
		}
		else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
		if(system_cpu_support_sse41()) {
			VLOG(1) << "Will be using SSE4.1 kernels.";
		}
		else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
		if(system_cpu_support_sse3()) {
			VLOG(1) << "Will be using SSE3 kernels.";
		}
		else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
		if(system_cpu_support_sse2()) {
			VLOG(1) << "Will be using SSE2 kernels.";
		}
		else
#endif
		{
			VLOG(1) << "Will be using regular kernels.";
		}
	}

	~CPUDevice()
	{
		task_pool.stop();
	}

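	/* The CPU device renders directly from host memory: device_pointer simply
	 * aliases data_pointer, so allocation only updates the statistics and the
	 * copy functions are no-ops. */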
	void mem_alloc(device_memory& mem, MemoryType /*type*/)
	{
		mem.device_pointer = mem.data_pointer;
		mem.device_size = mem.memory_size();
		stats.mem_alloc(mem.device_size);
	}

	void mem_copy_to(device_memory& /*mem*/)
	{
		/* no-op */
	}

	void mem_copy_from(device_memory& /*mem*/,
	                   int /*y*/, int /*w*/, int /*h*/,
	                   int /*elem*/)
	{
		/* no-op */
	}

	void mem_zero(device_memory& mem)
	{
		memset((void*)mem.device_pointer, 0, mem.memory_size());
	}

	void mem_free(device_memory& mem)
	{
		if(mem.device_pointer) {
			mem.device_pointer = 0;
			stats.mem_free(mem.device_size);
			mem.device_size = 0;
		}
	}

	void const_copy_to(const char *name, void *host, size_t size)
	{
		kernel_const_copy(&kernel_globals, name, host, size);
	}

	void tex_alloc(const char *name,
	               device_memory& mem,
	               InterpolationType interpolation,
	               ExtensionType extension)
	{
		VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
		kernel_tex_copy(&kernel_globals,
		                name,
		                mem.data_pointer,
		                mem.data_width,
		                mem.data_height,
		                mem.data_depth,
		                interpolation,
		                extension);
		mem.device_pointer = mem.data_pointer;
		mem.device_size = mem.memory_size();
		stats.mem_alloc(mem.device_size);
	}

	void tex_free(device_memory& mem)
	{
		if(mem.device_pointer) {
			mem.device_pointer = 0;
			stats.mem_free(mem.device_size);
			mem.device_size = 0;
		}
	}

	void *osl_memory()
	{
#ifdef WITH_OSL
		return &osl_globals;
#else
		return NULL;
#endif
	}

	void thread_run(DeviceTask *task)
	{
		if(task->type == DeviceTask::PATH_TRACE)
			thread_path_trace(*task);
		else if(task->type == DeviceTask::FILM_CONVERT)
			thread_film_convert(*task);
		else if(task->type == DeviceTask::SHADER)
			thread_shader(*task);
	}

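	/* Wraps a DeviceTask so that the task pool executes CPUDevice::thread_run()
	 * for it on a worker thread. */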
	class CPUDeviceTask : public DeviceTask {
	public:
		CPUDeviceTask(CPUDevice *device, DeviceTask& task)
		: DeviceTask(task)
		{
			run = function_bind(&CPUDevice::thread_run, device, this);
		}
	};

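	/* Path tracing works on a private copy of KernelGlobals (see
	 * thread_kernel_globals_init() below), so each worker thread can keep and
	 * reuse its own scratch buffers instead of allocating per call. */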
	void thread_path_trace(DeviceTask& task)
	{
		if(task_pool.canceled()) {
			if(task.need_finish_queue == false)
				return;
		}

		KernelGlobals kg = thread_kernel_globals_init();
		RenderTile tile;

		void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int);

#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
		if(system_cpu_support_avx2()) {
			path_trace_kernel = kernel_cpu_avx2_path_trace;
		}
		else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
		if(system_cpu_support_avx()) {
			path_trace_kernel = kernel_cpu_avx_path_trace;
		}
		else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
		if(system_cpu_support_sse41()) {
			path_trace_kernel = kernel_cpu_sse41_path_trace;
		}
		else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
		if(system_cpu_support_sse3()) {
			path_trace_kernel = kernel_cpu_sse3_path_trace;
		}
		else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
		if(system_cpu_support_sse2()) {
			path_trace_kernel = kernel_cpu_sse2_path_trace;
		}
		else
#endif
		{
			path_trace_kernel = kernel_cpu_path_trace;
		}

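		/* Render tiles handed out by the scheduler until none are left,
		 * checking for cancellation between samples and reporting progress
		 * after each one. */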
		while(task.acquire_tile(this, tile)) {
			float *render_buffer = (float*)tile.buffer;
			uint *rng_state = (uint*)tile.rng_state;
			int start_sample = tile.start_sample;
			int end_sample = tile.start_sample + tile.num_samples;

			for(int sample = start_sample; sample < end_sample; sample++) {
				if(task.get_cancel() || task_pool.canceled()) {
					if(task.need_finish_queue == false)
						break;
				}

				for(int y = tile.y; y < tile.y + tile.h; y++) {
					for(int x = tile.x; x < tile.x + tile.w; x++) {
						path_trace_kernel(&kg, render_buffer, rng_state,
						                  sample, x, y, tile.offset, tile.stride);
					}
				}

				tile.sample = sample + 1;

				task.update_progress(&tile);
			}

			task.release_tile(tile);

			if(task_pool.canceled()) {
				if(task.need_finish_queue == false)
					break;
			}
		}

		thread_kernel_globals_free(&kg);
	}

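	/* Convert the accumulated float render buffer to display pixels, either
	 * half float or byte RGBA, scaled by 1/(sample + 1). The conversion kernel
	 * is selected with the same CPU-capability cascade as above. */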
	void thread_film_convert(DeviceTask& task)
	{
		float sample_scale = 1.0f/(task.sample + 1);

		if(task.rgba_half) {
			void(*convert_to_half_float_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
			if(system_cpu_support_avx2()) {
				convert_to_half_float_kernel = kernel_cpu_avx2_convert_to_half_float;
			}
			else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
			if(system_cpu_support_avx()) {
				convert_to_half_float_kernel = kernel_cpu_avx_convert_to_half_float;
			}
			else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
			if(system_cpu_support_sse41()) {
				convert_to_half_float_kernel = kernel_cpu_sse41_convert_to_half_float;
			}
			else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
			if(system_cpu_support_sse3()) {
				convert_to_half_float_kernel = kernel_cpu_sse3_convert_to_half_float;
			}
			else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
			if(system_cpu_support_sse2()) {
				convert_to_half_float_kernel = kernel_cpu_sse2_convert_to_half_float;
			}
			else
#endif
			{
				convert_to_half_float_kernel = kernel_cpu_convert_to_half_float;
			}

			for(int y = task.y; y < task.y + task.h; y++)
				for(int x = task.x; x < task.x + task.w; x++)
					convert_to_half_float_kernel(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
					                             sample_scale, x, y, task.offset, task.stride);
		}
		else {
			void(*convert_to_byte_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
			if(system_cpu_support_avx2()) {
				convert_to_byte_kernel = kernel_cpu_avx2_convert_to_byte;
			}
			else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
			if(system_cpu_support_avx()) {
				convert_to_byte_kernel = kernel_cpu_avx_convert_to_byte;
			}
			else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
			if(system_cpu_support_sse41()) {
				convert_to_byte_kernel = kernel_cpu_sse41_convert_to_byte;
			}
			else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
			if(system_cpu_support_sse3()) {
				convert_to_byte_kernel = kernel_cpu_sse3_convert_to_byte;
			}
			else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
			if(system_cpu_support_sse2()) {
				convert_to_byte_kernel = kernel_cpu_sse2_convert_to_byte;
			}
			else
#endif
			{
				convert_to_byte_kernel = kernel_cpu_convert_to_byte;
			}

			for(int y = task.y; y < task.y + task.h; y++)
				for(int x = task.x; x < task.x + task.w; x++)
					convert_to_byte_kernel(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
					                       sample_scale, x, y, task.offset, task.stride);
		}
	}

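	/* Evaluate a batch of shader requests from shader_input into shader_output
	 * (and the optional luma output), one pass per sample, using the kernel
	 * flavour picked for this CPU. OSL per-thread data is set up and torn down
	 * around the loop when built with OSL. */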
	void thread_shader(DeviceTask& task)
	{
		KernelGlobals kg = kernel_globals;

#ifdef WITH_OSL
		OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif
		void(*shader_kernel)(KernelGlobals*, uint4*, float4*, float*, int, int, int, int, int);

#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
		if(system_cpu_support_avx2()) {
			shader_kernel = kernel_cpu_avx2_shader;
		}
		else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
		if(system_cpu_support_avx()) {
			shader_kernel = kernel_cpu_avx_shader;
		}
		else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
		if(system_cpu_support_sse41()) {
			shader_kernel = kernel_cpu_sse41_shader;
		}
		else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
		if(system_cpu_support_sse3()) {
			shader_kernel = kernel_cpu_sse3_shader;
		}
		else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
		if(system_cpu_support_sse2()) {
			shader_kernel = kernel_cpu_sse2_shader;
		}
		else
#endif
		{
			shader_kernel = kernel_cpu_shader;
		}

		for(int sample = 0; sample < task.num_samples; sample++) {
			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
				shader_kernel(&kg,
				              (uint4*)task.shader_input,
				              (float4*)task.shader_output,
				              (float*)task.shader_output_luma,
				              task.shader_eval_type,
				              task.shader_filter,
				              x,
				              task.offset,
				              sample);

			if(task.get_cancel() || task_pool.canceled())
				break;

			task.update_progress(NULL);
		}

#ifdef WITH_OSL
		OSLShader::thread_free(&kg);
#endif
	}

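	/* Shader tasks pass an extra 256 to get_subtask_count()/split();
	 * presumably a minimum per-piece size so shader work is not split too
	 * finely (assumption, the actual splitting logic lives in DeviceTask). */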
	int get_split_task_count(DeviceTask& task)
	{
		if(task.type == DeviceTask::SHADER)
			return task.get_subtask_count(TaskScheduler::num_threads(), 256);
		else
			return task.get_subtask_count(TaskScheduler::num_threads());
	}

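	/* Split the task into per-thread subtasks and hand them to the task pool;
	 * each subtask reaches thread_run() via a CPUDeviceTask. */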
	void task_add(DeviceTask& task)
	{
		/* split task into smaller ones */
		list<DeviceTask> tasks;

		if(task.type == DeviceTask::SHADER)
			task.split(tasks, TaskScheduler::num_threads(), 256);
		else
			task.split(tasks, TaskScheduler::num_threads());

		foreach(DeviceTask& task, tasks)
			task_pool.push(new CPUDeviceTask(this, task));
	}

	void task_wait()
	{
		task_pool.wait_work();
	}

	void task_cancel()
	{
		task_pool.cancel();
	}

protected:
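	/* Make a per-thread copy of the shared KernelGlobals. The scratch pointers
	 * (transparent shadow intersections, decoupled volume steps) start out
	 * NULL; the kernel allocates them lazily once per thread and then reuses
	 * the buffers across samples, which keeps malloc() calls out of the inner
	 * rendering loop. */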
	inline KernelGlobals thread_kernel_globals_init()
	{
		KernelGlobals kg = kernel_globals;
		kg.transparent_shadow_intersections = NULL;
		const int decoupled_count = sizeof(kg.decoupled_volume_steps) /
		                            sizeof(*kg.decoupled_volume_steps);
		for(int i = 0; i < decoupled_count; ++i) {
			kg.decoupled_volume_steps[i] = NULL;
		}
		kg.decoupled_volume_steps_index = 0;
#ifdef WITH_OSL
		OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif
		return kg;
	}

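	/* Release the per-thread scratch buffers the kernel may have allocated,
	 * along with the per-thread OSL data. */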
	inline void thread_kernel_globals_free(KernelGlobals *kg)
	{
		if(kg->transparent_shadow_intersections != NULL) {
			free(kg->transparent_shadow_intersections);
		}
		const int decoupled_count = sizeof(kg->decoupled_volume_steps) /
		                            sizeof(*kg->decoupled_volume_steps);
		for(int i = 0; i < decoupled_count; ++i) {
			if(kg->decoupled_volume_steps[i] != NULL) {
				free(kg->decoupled_volume_steps[i]);
			}
		}
#ifdef WITH_OSL
		OSLShader::thread_free(kg);
#endif
	}
};

Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
{
	return new CPUDevice(info, stats, background);
}

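/* Enumerate the single CPU device; it is inserted at the front so it appears
 * before other device types in the list. */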
void device_cpu_info(vector<DeviceInfo>& devices)
{
	DeviceInfo info;

	info.type = DEVICE_CPU;
	info.description = system_cpu_brand_string();
	info.id = "CPU";
	info.num = 0;
	info.advanced_shading = true;
	info.pack_images = false;

	devices.insert(devices.begin(), info);
}

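/* Build a space-separated string of the instruction sets supported by this
 * CPU (e.g. "SSE2 SSE3 SSE41 AVX AVX2"), without a trailing space. */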
string device_cpu_capabilities(void)
{
	string capabilities = "";
	capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
	capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
	capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
	capabilities += system_cpu_support_avx() ? "AVX " : "";
	capabilities += system_cpu_support_avx2() ? "AVX2" : "";
	if(!capabilities.empty() && capabilities[capabilities.size() - 1] == ' ')
		capabilities.resize(capabilities.size() - 1);
	return capabilities;
}
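
/* Minimal usage sketch (hypothetical caller, shown only for illustration;
 * real setup of DeviceInfo/Stats lives elsewhere in Cycles):
 *
 *   vector<DeviceInfo> devices;
 *   device_cpu_info(devices);
 *
 *   Stats stats;
 *   Device *device = device_cpu_create(devices[0], stats, true);
 *   // queue work with device->task_add(task), wait with device->task_wait(),
 *   // then delete the device when done.
 */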

CCL_NAMESPACE_END