/*
 * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <stdlib.h>
#include <string.h>

/* So ImathMath is included before our kernel_cpu_compat. */
#ifdef WITH_OSL
/* So no context pollution happens from indirectly included windows.h */
#  include "util/util_windows.h"
#  include <OSL/oslexec.h>
#endif

#include "device/device.h"
#include "device/device_intern.h"
#include "device/device_split_kernel.h"

#include "kernel/kernel.h"
#include "kernel/kernel_compat_cpu.h"
#include "kernel/kernel_types.h"
#include "kernel/split/kernel_split_data.h"
#include "kernel/kernel_globals.h"

#include "kernel/osl/osl_shader.h"
#include "kernel/osl/osl_globals.h"

#include "render/buffers.h"

#include "util/util_debug.h"
#include "util/util_foreach.h"
#include "util/util_function.h"
#include "util/util_logging.h"
#include "util/util_map.h"
#include "util/util_opengl.h"
#include "util/util_progress.h"
#include "util/util_system.h"
#include "util/util_thread.h"

CCL_NAMESPACE_BEGIN

class CPUDevice;

class CPUSplitKernel : public DeviceSplitKernel {
        CPUDevice *device;
public:
        explicit CPUSplitKernel(CPUDevice *device);

        virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
                                                    RenderTile& rtile,
                                                    int num_global_elements,
                                                    device_memory& kernel_globals,
                                                    device_memory& kernel_data_,
                                                    device_memory& split_data,
                                                    device_memory& ray_state,
                                                    device_memory& queue_index,
                                                    device_memory& use_queues_flag,
                                                    device_memory& work_pool_wgs);

        virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
        virtual int2 split_kernel_local_size();
        virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
        virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
};

class CPUDevice : public Device
{
        static unordered_map<string, void*> kernel_functions;

        static void register_kernel_function(const char* name, void* func)
        {
                kernel_functions[name] = func;
        }

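        /* Return the name of the most specialized kernel architecture that
         * is both compiled in and supported by the CPU we are running on,
         * e.g. "cpu_avx2" on an AVX2-capable machine. Each #ifdef block
         * below ends in a dangling `else`, so the cascade reads as a single
         * if/else ladder that falls through to the plain "cpu" kernels. */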
        static const char* get_arch_name()
        {
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
                if(system_cpu_support_avx2()) {
                        return "cpu_avx2";
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
                if(system_cpu_support_avx()) {
                        return "cpu_avx";
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
                if(system_cpu_support_sse41()) {
                        return "cpu_sse41";
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
                if(system_cpu_support_sse3()) {
                        return "cpu_sse3";
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
                if(system_cpu_support_sse2()) {
                        return "cpu_sse2";
                }
                else
#endif
                {
                        return "cpu";
                }
        }

        template<typename F>
        static F get_kernel_function(string name)
        {
                name = string("kernel_") + get_arch_name() + "_" + name;

                unordered_map<string, void*>::iterator it = kernel_functions.find(name);

                if(it == kernel_functions.end()) {
                        assert(!"kernel function not found");
                        return NULL;
                }

                return (F)it->second;
        }
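
        /* Example: with the SSE4.1 kernels compiled in and running on a CPU
         * that supports them, a (hypothetical) lookup such as
         *
         *   typedef void(*kernel_t)(KernelGlobals*, KernelData*);
         *   kernel_t func = get_kernel_function<kernel_t>("path_init");
         *
         * resolves the split kernel entry registered under the name
         * "kernel_cpu_sse41_path_init". */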

        friend class CPUSplitKernel;

public:
        TaskPool task_pool;
        KernelGlobals kernel_globals;

#ifdef WITH_OSL
        OSLGlobals osl_globals;
#endif

        bool use_split_kernel;

        DeviceRequestedFeatures requested_features;

        CPUDevice(DeviceInfo& info, Stats &stats, bool background)
        : Device(info, stats, background)
        {
#ifdef WITH_OSL
                kernel_globals.osl = &osl_globals;
#endif

                /* Run the CPU feature detection once up front, to avoid
                 * thread safety issues when it is queried concurrently later. */
                system_cpu_support_sse2();
                system_cpu_support_sse3();
                system_cpu_support_sse41();
                system_cpu_support_avx();
                system_cpu_support_avx2();

#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
                if(system_cpu_support_avx2()) {
                        VLOG(1) << "Will be using AVX2 kernels.";
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
                if(system_cpu_support_avx()) {
                        VLOG(1) << "Will be using AVX kernels.";
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
                if(system_cpu_support_sse41()) {
                        VLOG(1) << "Will be using SSE4.1 kernels.";
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
                if(system_cpu_support_sse3()) {
                        VLOG(1) << "Will be using SSE3 kernels.";
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
                if(system_cpu_support_sse2()) {
                        VLOG(1) << "Will be using SSE2 kernels.";
                }
                else
#endif
                {
                        VLOG(1) << "Will be using regular kernels.";
                }

                use_split_kernel = DebugFlags().cpu.split_kernel;
                if(use_split_kernel) {
                        VLOG(1) << "Will be using split kernel.";
                }

                kernel_cpu_register_functions(register_kernel_function);
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
                kernel_cpu_sse2_register_functions(register_kernel_function);
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
                kernel_cpu_sse3_register_functions(register_kernel_function);
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
                kernel_cpu_sse41_register_functions(register_kernel_function);
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
                kernel_cpu_avx_register_functions(register_kernel_function);
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
                kernel_cpu_avx2_register_functions(register_kernel_function);
#endif
        }

        ~CPUDevice()
        {
                task_pool.stop();
        }

        virtual bool show_samples() const
        {
                return (TaskScheduler::num_threads() == 1);
        }

        void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/)
        {
                if(name) {
                        VLOG(1) << "Buffer allocate: " << name << ", "
                                << string_human_readable_number(mem.memory_size()) << " bytes. ("
                                << string_human_readable_size(mem.memory_size()) << ")";
                }

                mem.device_pointer = mem.data_pointer;

                if(!mem.device_pointer) {
                        mem.device_pointer = (device_ptr)malloc(mem.memory_size());
                }

                mem.device_size = mem.memory_size();
                stats.mem_alloc(mem.device_size);
        }

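        /* On the CPU device, "device memory" is plain host memory: mem_alloc()
         * above aliases mem.data_pointer whenever it exists, which is why the
         * host/device copies below can be no-ops. */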
        void mem_copy_to(device_memory& /*mem*/)
        {
                /* no-op */
        }

        void mem_copy_from(device_memory& /*mem*/,
                           int /*y*/, int /*w*/, int /*h*/,
                           int /*elem*/)
        {
                /* no-op */
        }

        void mem_zero(device_memory& mem)
        {
                memset((void*)mem.device_pointer, 0, mem.memory_size());
        }

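        /* Only free the pointer when we own it, i.e. when mem_alloc() had to
         * malloc() because there was no host data pointer to alias. */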
        void mem_free(device_memory& mem)
        {
                if(mem.device_pointer) {
                        if(!mem.data_pointer) {
                                free((void*)mem.device_pointer);
                        }

                        mem.device_pointer = 0;
                        stats.mem_free(mem.device_size);
                        mem.device_size = 0;
                }
        }

        void const_copy_to(const char *name, void *host, size_t size)
        {
                kernel_const_copy(&kernel_globals, name, host, size);
        }

        void tex_alloc(const char *name,
                       device_memory& mem,
                       InterpolationType interpolation,
                       ExtensionType extension)
        {
                VLOG(1) << "Texture allocate: " << name << ", "
                        << string_human_readable_number(mem.memory_size()) << " bytes. ("
                        << string_human_readable_size(mem.memory_size()) << ")";
                kernel_tex_copy(&kernel_globals,
                                name,
                                mem.data_pointer,
                                mem.data_width,
                                mem.data_height,
                                mem.data_depth,
                                interpolation,
                                extension);
                mem.device_pointer = mem.data_pointer;
                mem.device_size = mem.memory_size();
                stats.mem_alloc(mem.device_size);
        }

        void tex_free(device_memory& mem)
        {
                if(mem.device_pointer) {
                        mem.device_pointer = 0;
                        stats.mem_free(mem.device_size);
                        mem.device_size = 0;
                }
        }

        void *osl_memory()
        {
#ifdef WITH_OSL
                return &osl_globals;
#else
                return NULL;
#endif
        }

        void thread_run(DeviceTask *task)
        {
                if(task->type == DeviceTask::PATH_TRACE) {
                        if(!use_split_kernel) {
                                thread_path_trace(*task);
                        }
                        else {
                                thread_path_trace_split(*task);
                        }
                }
                else if(task->type == DeviceTask::FILM_CONVERT)
                        thread_film_convert(*task);
                else if(task->type == DeviceTask::SHADER)
                        thread_shader(*task);
        }

        class CPUDeviceTask : public DeviceTask {
        public:
                CPUDeviceTask(CPUDevice *device, DeviceTask& task)
                : DeviceTask(task)
                {
                        run = function_bind(&CPUDevice::thread_run, device, this);
                }
        };

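        /* Each thread_*() entry point below picks the most specialized kernel
         * entry that is compiled in and supported by this CPU, mirroring the
         * cascade in get_arch_name(), and then invokes it per pixel and sample. */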
        void thread_path_trace(DeviceTask& task)
        {
                if(task_pool.canceled()) {
                        if(task.need_finish_queue == false)
                                return;
                }

                KernelGlobals kg = thread_kernel_globals_init();
                RenderTile tile;

                void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int);

#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
                if(system_cpu_support_avx2()) {
                        path_trace_kernel = kernel_cpu_avx2_path_trace;
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
                if(system_cpu_support_avx()) {
                        path_trace_kernel = kernel_cpu_avx_path_trace;
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
                if(system_cpu_support_sse41()) {
                        path_trace_kernel = kernel_cpu_sse41_path_trace;
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
                if(system_cpu_support_sse3()) {
                        path_trace_kernel = kernel_cpu_sse3_path_trace;
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
                if(system_cpu_support_sse2()) {
                        path_trace_kernel = kernel_cpu_sse2_path_trace;
                }
                else
#endif
                {
                        path_trace_kernel = kernel_cpu_path_trace;
                }

                while(task.acquire_tile(this, tile)) {
                        float *render_buffer = (float*)tile.buffer;
                        uint *rng_state = (uint*)tile.rng_state;
                        int start_sample = tile.start_sample;
                        int end_sample = tile.start_sample + tile.num_samples;

                        for(int sample = start_sample; sample < end_sample; sample++) {
                                if(task.get_cancel() || task_pool.canceled()) {
                                        if(task.need_finish_queue == false)
                                                break;
                                }

                                for(int y = tile.y; y < tile.y + tile.h; y++) {
                                        for(int x = tile.x; x < tile.x + tile.w; x++) {
                                                path_trace_kernel(&kg, render_buffer, rng_state,
                                                                  sample, x, y, tile.offset, tile.stride);
                                        }
                                }

                                tile.sample = sample + 1;

                                task.update_progress(&tile, tile.w*tile.h);
                        }

                        task.release_tile(tile);

                        if(task_pool.canceled()) {
                                if(task.need_finish_queue == false)
                                        break;
                        }
                }

                thread_kernel_globals_free(&kg);
        }

        void thread_path_trace_split(DeviceTask& task)
        {
                if(task_pool.canceled()) {
                        if(task.need_finish_queue == false)
                                return;
                }

                RenderTile tile;

                CPUSplitKernel split_kernel(this);

                /* allocate buffer for kernel globals */
                device_memory kgbuffer;
                kgbuffer.resize(sizeof(KernelGlobals));
                mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE);
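
                /* Placement new constructs the per-thread KernelGlobals directly
                 * inside the buffer, so the split kernel can reach it through
                 * the device pointer. */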
                KernelGlobals *kg = new ((void*) kgbuffer.device_pointer) KernelGlobals(thread_kernel_globals_init());

                requested_features.max_closure = MAX_CLOSURE;
                if(!split_kernel.load_kernels(requested_features)) {
                        thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
                        mem_free(kgbuffer);

                        return;
                }

                while(task.acquire_tile(this, tile)) {
                        device_memory data;
                        split_kernel.path_trace(&task, tile, kgbuffer, data);

                        task.release_tile(tile);

                        if(task_pool.canceled()) {
                                if(task.need_finish_queue == false)
                                        break;
                        }
                }

                thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
                mem_free(kgbuffer);
        }

        void thread_film_convert(DeviceTask& task)
        {
                float sample_scale = 1.0f/(task.sample + 1);

                if(task.rgba_half) {
                        void(*convert_to_half_float_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
                        if(system_cpu_support_avx2()) {
                                convert_to_half_float_kernel = kernel_cpu_avx2_convert_to_half_float;
                        }
                        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
                        if(system_cpu_support_avx()) {
                                convert_to_half_float_kernel = kernel_cpu_avx_convert_to_half_float;
                        }
                        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
                        if(system_cpu_support_sse41()) {
                                convert_to_half_float_kernel = kernel_cpu_sse41_convert_to_half_float;
                        }
                        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
                        if(system_cpu_support_sse3()) {
                                convert_to_half_float_kernel = kernel_cpu_sse3_convert_to_half_float;
                        }
                        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
                        if(system_cpu_support_sse2()) {
                                convert_to_half_float_kernel = kernel_cpu_sse2_convert_to_half_float;
                        }
                        else
#endif
                        {
                                convert_to_half_float_kernel = kernel_cpu_convert_to_half_float;
                        }

                        for(int y = task.y; y < task.y + task.h; y++)
                                for(int x = task.x; x < task.x + task.w; x++)
                                        convert_to_half_float_kernel(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
                                                sample_scale, x, y, task.offset, task.stride);
                }
                else {
                        void(*convert_to_byte_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
                        if(system_cpu_support_avx2()) {
                                convert_to_byte_kernel = kernel_cpu_avx2_convert_to_byte;
                        }
                        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
                        if(system_cpu_support_avx()) {
                                convert_to_byte_kernel = kernel_cpu_avx_convert_to_byte;
                        }
                        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
                        if(system_cpu_support_sse41()) {
                                convert_to_byte_kernel = kernel_cpu_sse41_convert_to_byte;
                        }
                        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
                        if(system_cpu_support_sse3()) {
                                convert_to_byte_kernel = kernel_cpu_sse3_convert_to_byte;
                        }
                        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
                        if(system_cpu_support_sse2()) {
                                convert_to_byte_kernel = kernel_cpu_sse2_convert_to_byte;
                        }
                        else
#endif
                        {
                                convert_to_byte_kernel = kernel_cpu_convert_to_byte;
                        }

                        for(int y = task.y; y < task.y + task.h; y++)
                                for(int x = task.x; x < task.x + task.w; x++)
                                        convert_to_byte_kernel(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
                                                sample_scale, x, y, task.offset, task.stride);
                }
        }

        void thread_shader(DeviceTask& task)
        {
                KernelGlobals kg = kernel_globals;

#ifdef WITH_OSL
                OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif
                void(*shader_kernel)(KernelGlobals*, uint4*, float4*, float*, int, int, int, int, int);

#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
                if(system_cpu_support_avx2()) {
                        shader_kernel = kernel_cpu_avx2_shader;
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
                if(system_cpu_support_avx()) {
                        shader_kernel = kernel_cpu_avx_shader;
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
                if(system_cpu_support_sse41()) {
                        shader_kernel = kernel_cpu_sse41_shader;
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
                if(system_cpu_support_sse3()) {
                        shader_kernel = kernel_cpu_sse3_shader;
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
                if(system_cpu_support_sse2()) {
                        shader_kernel = kernel_cpu_sse2_shader;
                }
                else
#endif
                {
                        shader_kernel = kernel_cpu_shader;
                }

                for(int sample = 0; sample < task.num_samples; sample++) {
                        for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
                                shader_kernel(&kg,
                                              (uint4*)task.shader_input,
                                              (float4*)task.shader_output,
                                              (float*)task.shader_output_luma,
                                              task.shader_eval_type,
                                              task.shader_filter,
                                              x,
                                              task.offset,
                                              sample);

                        if(task.get_cancel() || task_pool.canceled())
                                break;

                        task.update_progress(NULL);
                }

#ifdef WITH_OSL
                OSLShader::thread_free(&kg);
#endif
        }

        int get_split_task_count(DeviceTask& task)
        {
                if(task.type == DeviceTask::SHADER)
                        return task.get_subtask_count(TaskScheduler::num_threads(), 256);
                else
                        return task.get_subtask_count(TaskScheduler::num_threads());
        }

        void task_add(DeviceTask& task)
        {
                /* split task into smaller ones */
                list<DeviceTask> tasks;

                if(task.type == DeviceTask::SHADER)
                        task.split(tasks, TaskScheduler::num_threads(), 256);
                else
                        task.split(tasks, TaskScheduler::num_threads());

                foreach(DeviceTask& task, tasks)
                        task_pool.push(new CPUDeviceTask(this, task));
        }

        void task_wait()
        {
                task_pool.wait_work();
        }

        void task_cancel()
        {
                task_pool.cancel();
        }

protected:
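        /* Each render thread works on its own copy of KernelGlobals. The
         * thread-local scratch pointers are cleared here so that every thread
         * allocates its own buffers on demand; those buffers are released
         * again in thread_kernel_globals_free(). */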
        inline KernelGlobals thread_kernel_globals_init()
        {
                KernelGlobals kg = kernel_globals;
                kg.transparent_shadow_intersections = NULL;
                const int decoupled_count = sizeof(kg.decoupled_volume_steps) /
                                            sizeof(*kg.decoupled_volume_steps);
                for(int i = 0; i < decoupled_count; ++i) {
                        kg.decoupled_volume_steps[i] = NULL;
                }
                kg.decoupled_volume_steps_index = 0;
#ifdef WITH_OSL
                OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif
                return kg;
        }

        inline void thread_kernel_globals_free(KernelGlobals *kg)
        {
                if(kg == NULL) {
                        return;
                }

                if(kg->transparent_shadow_intersections != NULL) {
                        free(kg->transparent_shadow_intersections);
                }
                const int decoupled_count = sizeof(kg->decoupled_volume_steps) /
                                            sizeof(*kg->decoupled_volume_steps);
                for(int i = 0; i < decoupled_count; ++i) {
                        if(kg->decoupled_volume_steps[i] != NULL) {
                                free(kg->decoupled_volume_steps[i]);
                        }
                }
#ifdef WITH_OSL
                OSLShader::thread_free(kg);
#endif
        }

        virtual bool load_kernels(DeviceRequestedFeatures& requested_features_) {
                requested_features = requested_features_;

                return true;
        }
};

/* split kernel */

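/* The CPU implementation emulates the GPU execution model: each split
 * kernel stage runs serially over the whole global work size, with
 * kg->global_id standing in for the GPU work-item index. Both the local
 * and global work sizes are kept at 1x1, see split_kernel_local_size()
 * and split_kernel_global_size() below. */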
class CPUSplitKernelFunction : public SplitKernelFunction {
public:
        CPUDevice* device;
        void (*func)(KernelGlobals *kg, KernelData *data);

        CPUSplitKernelFunction(CPUDevice* device) : device(device), func(NULL) {}
        ~CPUSplitKernelFunction() {}

        virtual bool enqueue(const KernelDimensions& dim, device_memory& kernel_globals, device_memory& data)
        {
                if(!func) {
                        return false;
                }

                KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
                kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);

                for(int y = 0; y < dim.global_size[1]; y++) {
                        for(int x = 0; x < dim.global_size[0]; x++) {
                                kg->global_id = make_int2(x, y);

                                func(kg, (KernelData*)data.device_pointer);
                        }
                }

                return true;
        }
};

CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device)
{
}

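/* Run the data_init kernel once per work-item to set up the split kernel
 * state (ray state, queues, work pools) for the tile, again selecting the
 * best instruction set available. */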
bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
                                                    RenderTile& rtile,
                                                    int num_global_elements,
                                                    device_memory& kernel_globals,
                                                    device_memory& data,
                                                    device_memory& split_data,
                                                    device_memory& ray_state,
                                                    device_memory& queue_index,
                                                    device_memory& use_queues_flags,
                                                    device_memory& work_pool_wgs)
{
        typedef void(*data_init_t)(KernelGlobals *kg,
                                   ccl_constant KernelData *data,
                                   ccl_global void *split_data_buffer,
                                   int num_elements,
                                   ccl_global char *ray_state,
                                   ccl_global uint *rng_state,
                                   int start_sample,
                                   int end_sample,
                                   int sx, int sy, int sw, int sh, int offset, int stride,
                                   ccl_global int *Queue_index,
                                   int queuesize,
                                   ccl_global char *use_queues_flag,
                                   ccl_global unsigned int *work_pool_wgs,
                                   unsigned int num_samples,
                                   ccl_global float *buffer);

        data_init_t data_init;

#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
        if(system_cpu_support_avx2()) {
                data_init = kernel_cpu_avx2_data_init;
        }
        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
        if(system_cpu_support_avx()) {
                data_init = kernel_cpu_avx_data_init;
        }
        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
        if(system_cpu_support_sse41()) {
                data_init = kernel_cpu_sse41_data_init;
        }
        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
        if(system_cpu_support_sse3()) {
                data_init = kernel_cpu_sse3_data_init;
        }
        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
        if(system_cpu_support_sse2()) {
                data_init = kernel_cpu_sse2_data_init;
        }
        else
#endif
        {
                data_init = kernel_cpu_data_init;
        }

        KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
        kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);

        for(int y = 0; y < dim.global_size[1]; y++) {
                for(int x = 0; x < dim.global_size[0]; x++) {
                        kg->global_id = make_int2(x, y);

                        data_init((KernelGlobals*)kernel_globals.device_pointer,
                                  (KernelData*)data.device_pointer,
                                  (void*)split_data.device_pointer,
                                  num_global_elements,
                                  (char*)ray_state.device_pointer,
                                  (uint*)rtile.rng_state,
                                  rtile.start_sample,
                                  rtile.start_sample + rtile.num_samples,
                                  rtile.x,
                                  rtile.y,
                                  rtile.w,
                                  rtile.h,
                                  rtile.offset,
                                  rtile.stride,
                                  (int*)queue_index.device_pointer,
                                  dim.global_size[0] * dim.global_size[1],
                                  (char*)use_queues_flags.device_pointer,
                                  (uint*)work_pool_wgs.device_pointer,
                                  rtile.num_samples,
                                  (float*)rtile.buffer);
                }
        }

        return true;
}

SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&)
{
        CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);

        kernel->func = device->get_kernel_function<void(*)(KernelGlobals*, KernelData*)>(kernel_name);
        if(!kernel->func) {
                delete kernel;
                return NULL;
        }

        return kernel;
}

int2 CPUSplitKernel::split_kernel_local_size()
{
        return make_int2(1, 1);
}

int2 CPUSplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask * /*task*/) {
        return make_int2(1, 1);
}

uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device_memory& /*data*/, size_t num_threads) {
        KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;

        return split_data_buffer_size(kg, num_threads);
}

unordered_map<string, void*> CPUDevice::kernel_functions;

Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
{
        return new CPUDevice(info, stats, background);
}

void device_cpu_info(vector<DeviceInfo>& devices)
{
        DeviceInfo info;

        info.type = DEVICE_CPU;
        info.description = system_cpu_brand_string();
        info.id = "CPU";
        info.num = 0;
        info.advanced_shading = true;
        info.pack_images = false;

        devices.insert(devices.begin(), info);
}
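
/* Example of how the two entry points above fit together (hypothetical
 * caller; in Blender this is driven by the device manager in device.cpp):
 *
 *   vector<DeviceInfo> devices;
 *   device_cpu_info(devices);
 *
 *   Stats stats;
 *   Device *device = device_cpu_create(devices[0], stats, true);
 */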

/* Report the instruction sets supported by the CPU as a space-separated
 * string, e.g. "SSE2 SSE3 SSE41 AVX AVX2" when all are available. */
string device_cpu_capabilities(void)
{
        string capabilities = "";
        capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
        capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
        capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
        capabilities += system_cpu_support_avx() ? "AVX " : "";
        capabilities += system_cpu_support_avx2() ? "AVX2" : "";
        /* Trim the trailing space, guarding against an empty string on CPUs
         * without any of these instruction sets. */
        if(!capabilities.empty() && capabilities[capabilities.size() - 1] == ' ')
                capabilities.resize(capabilities.size() - 1);
        return capabilities;
}

CCL_NAMESPACE_END