Cycles: add single program debug option for split kernel
[blender-staging.git] intern/cycles/device/device_cpu.cpp
/*
 * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <stdlib.h>
#include <string.h>

/* So ImathMath is included before our kernel_cpu_compat. */
#ifdef WITH_OSL
/* So no context pollution happens from indirectly included windows.h */
#  include "util_windows.h"
#  include <OSL/oslexec.h>
#endif

#include "device.h"
#include "device_intern.h"
#include "device_split_kernel.h"

#include "kernel.h"
#include "kernel_compat_cpu.h"
#include "kernel_types.h"
#include "split/kernel_split_data.h"
#include "kernel_globals.h"

#include "osl_shader.h"
#include "osl_globals.h"

#include "buffers.h"

#include "util_debug.h"
#include "util_foreach.h"
#include "util_function.h"
#include "util_logging.h"
#include "util_map.h"
#include "util_opengl.h"
#include "util_progress.h"
#include "util_system.h"
#include "util_thread.h"

CCL_NAMESPACE_BEGIN

class CPUDevice;

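/* CPU implementation of the split kernel. It reuses the generic
 * DeviceSplitKernel driver and runs each GPU-style split kernel stage as a
 * plain function call, looping over the virtual work-items on the host. */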
class CPUSplitKernel : public DeviceSplitKernel {
        CPUDevice *device;
public:
        explicit CPUSplitKernel(CPUDevice *device);

        virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
                                                    RenderTile& rtile,
                                                    int num_global_elements,
                                                    device_memory& kernel_globals,
                                                    device_memory& kernel_data_,
                                                    device_memory& split_data,
                                                    device_memory& ray_state,
                                                    device_memory& queue_index,
                                                    device_memory& use_queues_flag,
                                                    device_memory& work_pool_wgs);

        virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
        virtual int2 split_kernel_local_size();
        virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
        virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
};

class CPUDevice : public Device
{
        static unordered_map<string, void*> kernel_functions;

        static void register_kernel_function(const char* name, void* func)
        {
                kernel_functions[name] = func;
        }

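        /* Returns the name of the best kernel variant compiled into this
         * binary that the current CPU can execute. The checks are ordered
         * from widest to narrowest instruction set, so the first match wins. */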
        static const char* get_arch_name()
        {
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
                if(system_cpu_support_avx2()) {
                        return "cpu_avx2";
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
                if(system_cpu_support_avx()) {
                        return "cpu_avx";
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
                if(system_cpu_support_sse41()) {
                        return "cpu_sse41";
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
                if(system_cpu_support_sse3()) {
                        return "cpu_sse3";
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
                if(system_cpu_support_sse2()) {
                        return "cpu_sse2";
                }
                else
#endif
                {
                        return "cpu";
                }
        }

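        /* Looks up a kernel entry point registered by the
         * kernel_cpu*_register_functions() calls in the constructor. Names are
         * mangled as "kernel_<arch>_<name>"; for example, requesting
         * "path_init" on an AVX2-capable machine resolves to the symbol
         * registered as "kernel_cpu_avx2_path_init". */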
        template<typename F>
        static F get_kernel_function(string name)
        {
                name = string("kernel_") + get_arch_name() + "_" + name;

                unordered_map<string, void*>::iterator it = kernel_functions.find(name);

                if(it == kernel_functions.end()) {
                        assert(!"kernel function not found");
                        return NULL;
                }

                return (F)it->second;
        }

        friend class CPUSplitKernel;

public:
        TaskPool task_pool;
        KernelGlobals kernel_globals;

#ifdef WITH_OSL
        OSLGlobals osl_globals;
#endif

        bool use_split_kernel;

        DeviceRequestedFeatures requested_features;

        CPUDevice(DeviceInfo& info, Stats &stats, bool background)
        : Device(info, stats, background)
        {

#ifdef WITH_OSL
                kernel_globals.osl = &osl_globals;
#endif

                /* do now to avoid thread issues */
                system_cpu_support_sse2();
                system_cpu_support_sse3();
                system_cpu_support_sse41();
                system_cpu_support_avx();
                system_cpu_support_avx2();

#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
                if(system_cpu_support_avx2()) {
                        VLOG(1) << "Will be using AVX2 kernels.";
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
                if(system_cpu_support_avx()) {
                        VLOG(1) << "Will be using AVX kernels.";
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
                if(system_cpu_support_sse41()) {
                        VLOG(1) << "Will be using SSE4.1 kernels.";
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
                if(system_cpu_support_sse3()) {
                        VLOG(1) << "Will be using SSE3 kernels.";
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
                if(system_cpu_support_sse2()) {
                        VLOG(1) << "Will be using SSE2 kernels.";
                }
                else
#endif
                {
                        VLOG(1) << "Will be using regular kernels.";
                }

                use_split_kernel = DebugFlags().cpu.split_kernel;
                if(use_split_kernel) {
                        VLOG(1) << "Will be using split kernel.";
                }

                kernel_cpu_register_functions(register_kernel_function);
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
                kernel_cpu_sse2_register_functions(register_kernel_function);
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
                kernel_cpu_sse3_register_functions(register_kernel_function);
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
                kernel_cpu_sse41_register_functions(register_kernel_function);
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
                kernel_cpu_avx_register_functions(register_kernel_function);
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
                kernel_cpu_avx2_register_functions(register_kernel_function);
#endif
        }

        ~CPUDevice()
        {
                task_pool.stop();
        }

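        /* Sample counts are only shown when rendering with a single thread;
         * with multiple threads each tile is at a different sample, so there
         * is no single meaningful number to display. */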
        virtual bool show_samples() const
        {
                return (TaskScheduler::num_threads() == 1);
        }

        void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/)
        {
                if(name) {
                        VLOG(1) << "Buffer allocate: " << name << ", "
                                << string_human_readable_number(mem.memory_size()) << " bytes. ("
                                << string_human_readable_size(mem.memory_size()) << ")";
                }

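                /* On the CPU device host and device memory are the same, so
                 * when the host side already has a data pointer we alias it
                 * instead of allocating a separate buffer. */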
                mem.device_pointer = mem.data_pointer;

                if(!mem.device_pointer) {
                        mem.device_pointer = (device_ptr)malloc(mem.memory_size());
                }

                mem.device_size = mem.memory_size();
                stats.mem_alloc(mem.device_size);
        }

        void mem_copy_to(device_memory& /*mem*/)
        {
                /* no-op */
        }

        void mem_copy_from(device_memory& /*mem*/,
                           int /*y*/, int /*w*/, int /*h*/,
                           int /*elem*/)
        {
                /* no-op */
        }

        void mem_zero(device_memory& mem)
        {
                memset((void*)mem.device_pointer, 0, mem.memory_size());
        }

        void mem_free(device_memory& mem)
        {
                if(mem.device_pointer) {
                        if(!mem.data_pointer) {
                                free((void*)mem.device_pointer);
                        }

                        mem.device_pointer = 0;
                        stats.mem_free(mem.device_size);
                        mem.device_size = 0;
                }
        }

        void const_copy_to(const char *name, void *host, size_t size)
        {
                kernel_const_copy(&kernel_globals, name, host, size);
        }

        void tex_alloc(const char *name,
                       device_memory& mem,
                       InterpolationType interpolation,
                       ExtensionType extension)
        {
                VLOG(1) << "Texture allocate: " << name << ", "
                        << string_human_readable_number(mem.memory_size()) << " bytes. ("
                        << string_human_readable_size(mem.memory_size()) << ")";
                kernel_tex_copy(&kernel_globals,
                                name,
                                mem.data_pointer,
                                mem.data_width,
                                mem.data_height,
                                mem.data_depth,
                                interpolation,
                                extension);
                mem.device_pointer = mem.data_pointer;
                mem.device_size = mem.memory_size();
                stats.mem_alloc(mem.device_size);
        }

        void tex_free(device_memory& mem)
        {
                if(mem.device_pointer) {
                        mem.device_pointer = 0;
                        stats.mem_free(mem.device_size);
                        mem.device_size = 0;
                }
        }

        void *osl_memory()
        {
#ifdef WITH_OSL
                return &osl_globals;
#else
                return NULL;
#endif
        }

        void thread_run(DeviceTask *task)
        {
                if(task->type == DeviceTask::PATH_TRACE) {
                        if(!use_split_kernel) {
                                thread_path_trace(*task);
                        }
                        else {
                                thread_path_trace_split(*task);
                        }
                }
                else if(task->type == DeviceTask::FILM_CONVERT)
                        thread_film_convert(*task);
                else if(task->type == DeviceTask::SHADER)
                        thread_shader(*task);
        }

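        /* Thin wrapper that binds a task to this device, so that the task
         * pool can invoke CPUDevice::thread_run() on it from a worker thread. */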
        class CPUDeviceTask : public DeviceTask {
        public:
                CPUDeviceTask(CPUDevice *device, DeviceTask& task)
                : DeviceTask(task)
                {
                        run = function_bind(&CPUDevice::thread_run, device, this);
                }
        };

        void thread_path_trace(DeviceTask& task)
        {
                if(task_pool.canceled()) {
                        if(task.need_finish_queue == false)
                                return;
                }

                KernelGlobals kg = thread_kernel_globals_init();
                RenderTile tile;

                void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int);

#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
                if(system_cpu_support_avx2()) {
                        path_trace_kernel = kernel_cpu_avx2_path_trace;
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
                if(system_cpu_support_avx()) {
                        path_trace_kernel = kernel_cpu_avx_path_trace;
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
                if(system_cpu_support_sse41()) {
                        path_trace_kernel = kernel_cpu_sse41_path_trace;
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
                if(system_cpu_support_sse3()) {
                        path_trace_kernel = kernel_cpu_sse3_path_trace;
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
                if(system_cpu_support_sse2()) {
                        path_trace_kernel = kernel_cpu_sse2_path_trace;
                }
                else
#endif
                {
                        path_trace_kernel = kernel_cpu_path_trace;
                }

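                /* Megakernel path: render one sample for every pixel of the
                 * tile, then advance to the next sample until the tile is
                 * done or the task is cancelled. */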
                while(task.acquire_tile(this, tile)) {
                        float *render_buffer = (float*)tile.buffer;
                        uint *rng_state = (uint*)tile.rng_state;
                        int start_sample = tile.start_sample;
                        int end_sample = tile.start_sample + tile.num_samples;

                        for(int sample = start_sample; sample < end_sample; sample++) {
                                if(task.get_cancel() || task_pool.canceled()) {
                                        if(task.need_finish_queue == false)
                                                break;
                                }

                                for(int y = tile.y; y < tile.y + tile.h; y++) {
                                        for(int x = tile.x; x < tile.x + tile.w; x++) {
                                                path_trace_kernel(&kg, render_buffer, rng_state,
                                                                  sample, x, y, tile.offset, tile.stride);
                                        }
                                }

                                tile.sample = sample + 1;

                                task.update_progress(&tile, tile.w*tile.h);
                        }

                        task.release_tile(tile);

                        if(task_pool.canceled()) {
                                if(task.need_finish_queue == false)
                                        break;
                        }
                }

                thread_kernel_globals_free(&kg);
        }

        void thread_path_trace_split(DeviceTask& task)
        {
                if(task_pool.canceled()) {
                        if(task.need_finish_queue == false)
                                return;
                }

                RenderTile tile;

                CPUSplitKernel split_kernel(this);

                /* allocate buffer for kernel globals */
                device_memory kgbuffer;
                kgbuffer.resize(sizeof(KernelGlobals));
                mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE);

                KernelGlobals *kg = (KernelGlobals*)kgbuffer.device_pointer;
                *kg = thread_kernel_globals_init();

                requested_features.max_closure = MAX_CLOSURE;
                if(!split_kernel.load_kernels(requested_features)) {
                        thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
                        mem_free(kgbuffer);

                        return;
                }

                while(task.acquire_tile(this, tile)) {
                        device_memory data;
                        split_kernel.path_trace(&task, tile, kgbuffer, data);

                        task.release_tile(tile);

                        if(task_pool.canceled()) {
                                if(task.need_finish_queue == false)
                                        break;
                        }
                }

                thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
                mem_free(kgbuffer);
        }

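        /* Converts the accumulated float render buffer to half-float or byte
         * RGBA for display, scaling by 1/(sample+1) to average the samples. */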
        void thread_film_convert(DeviceTask& task)
        {
                float sample_scale = 1.0f/(task.sample + 1);

                if(task.rgba_half) {
                        void(*convert_to_half_float_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
                        if(system_cpu_support_avx2()) {
                                convert_to_half_float_kernel = kernel_cpu_avx2_convert_to_half_float;
                        }
                        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
                        if(system_cpu_support_avx()) {
                                convert_to_half_float_kernel = kernel_cpu_avx_convert_to_half_float;
                        }
                        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
                        if(system_cpu_support_sse41()) {
                                convert_to_half_float_kernel = kernel_cpu_sse41_convert_to_half_float;
                        }
                        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
                        if(system_cpu_support_sse3()) {
                                convert_to_half_float_kernel = kernel_cpu_sse3_convert_to_half_float;
                        }
                        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
                        if(system_cpu_support_sse2()) {
                                convert_to_half_float_kernel = kernel_cpu_sse2_convert_to_half_float;
                        }
                        else
#endif
                        {
                                convert_to_half_float_kernel = kernel_cpu_convert_to_half_float;
                        }

                        for(int y = task.y; y < task.y + task.h; y++)
                                for(int x = task.x; x < task.x + task.w; x++)
                                        convert_to_half_float_kernel(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
                                                sample_scale, x, y, task.offset, task.stride);
                }
                else {
                        void(*convert_to_byte_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
                        if(system_cpu_support_avx2()) {
                                convert_to_byte_kernel = kernel_cpu_avx2_convert_to_byte;
                        }
                        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
                        if(system_cpu_support_avx()) {
                                convert_to_byte_kernel = kernel_cpu_avx_convert_to_byte;
                        }
                        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
                        if(system_cpu_support_sse41()) {
                                convert_to_byte_kernel = kernel_cpu_sse41_convert_to_byte;
                        }
                        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
                        if(system_cpu_support_sse3()) {
                                convert_to_byte_kernel = kernel_cpu_sse3_convert_to_byte;
                        }
                        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
                        if(system_cpu_support_sse2()) {
                                convert_to_byte_kernel = kernel_cpu_sse2_convert_to_byte;
                        }
                        else
#endif
                        {
                                convert_to_byte_kernel = kernel_cpu_convert_to_byte;
                        }

                        for(int y = task.y; y < task.y + task.h; y++)
                                for(int x = task.x; x < task.x + task.w; x++)
                                        convert_to_byte_kernel(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
                                                sample_scale, x, y, task.offset, task.stride);
                }
        }

        void thread_shader(DeviceTask& task)
        {
                KernelGlobals kg = kernel_globals;

#ifdef WITH_OSL
                OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif
                void(*shader_kernel)(KernelGlobals*, uint4*, float4*, float*, int, int, int, int, int);

#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
                if(system_cpu_support_avx2()) {
                        shader_kernel = kernel_cpu_avx2_shader;
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
                if(system_cpu_support_avx()) {
                        shader_kernel = kernel_cpu_avx_shader;
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
                if(system_cpu_support_sse41()) {
                        shader_kernel = kernel_cpu_sse41_shader;
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
                if(system_cpu_support_sse3()) {
                        shader_kernel = kernel_cpu_sse3_shader;
                }
                else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
                if(system_cpu_support_sse2()) {
                        shader_kernel = kernel_cpu_sse2_shader;
                }
                else
#endif
                {
                        shader_kernel = kernel_cpu_shader;
                }

                for(int sample = 0; sample < task.num_samples; sample++) {
                        for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
                                shader_kernel(&kg,
                                              (uint4*)task.shader_input,
                                              (float4*)task.shader_output,
                                              (float*)task.shader_output_luma,
                                              task.shader_eval_type,
                                              task.shader_filter,
                                              x,
                                              task.offset,
                                              sample);

                        if(task.get_cancel() || task_pool.canceled())
                                break;

                        task.update_progress(NULL);
                }

#ifdef WITH_OSL
                OSLShader::thread_free(&kg);
#endif
        }

        int get_split_task_count(DeviceTask& task)
        {
                if(task.type == DeviceTask::SHADER)
                        return task.get_subtask_count(TaskScheduler::num_threads(), 256);
                else
                        return task.get_subtask_count(TaskScheduler::num_threads());
        }

        void task_add(DeviceTask& task)
        {
                /* split task into smaller ones */
                list<DeviceTask> tasks;

                if(task.type == DeviceTask::SHADER)
                        task.split(tasks, TaskScheduler::num_threads(), 256);
                else
                        task.split(tasks, TaskScheduler::num_threads());

                foreach(DeviceTask& task, tasks)
                        task_pool.push(new CPUDeviceTask(this, task));
        }

        void task_wait()
        {
                task_pool.wait_work();
        }

        void task_cancel()
        {
                task_pool.cancel();
        }

protected:
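        /* Each rendering thread works on a private copy of KernelGlobals, so
         * per-thread caches such as transparent shadow intersections and
         * decoupled volume steps must be reset here and freed again in
         * thread_kernel_globals_free(). */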
        inline KernelGlobals thread_kernel_globals_init()
        {
                KernelGlobals kg = kernel_globals;
                kg.transparent_shadow_intersections = NULL;
                const int decoupled_count = sizeof(kg.decoupled_volume_steps) /
                                            sizeof(*kg.decoupled_volume_steps);
                for(int i = 0; i < decoupled_count; ++i) {
                        kg.decoupled_volume_steps[i] = NULL;
                }
                kg.decoupled_volume_steps_index = 0;
#ifdef WITH_OSL
                OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif
                return kg;
        }

        inline void thread_kernel_globals_free(KernelGlobals *kg)
        {
                if(kg == NULL) {
                        return;
                }

                if(kg->transparent_shadow_intersections != NULL) {
                        free(kg->transparent_shadow_intersections);
                }
                const int decoupled_count = sizeof(kg->decoupled_volume_steps) /
                                            sizeof(*kg->decoupled_volume_steps);
                for(int i = 0; i < decoupled_count; ++i) {
                        if(kg->decoupled_volume_steps[i] != NULL) {
                                free(kg->decoupled_volume_steps[i]);
                        }
                }
#ifdef WITH_OSL
                OSLShader::thread_free(kg);
#endif
        }

        virtual bool load_kernels(DeviceRequestedFeatures& requested_features_) {
                requested_features = requested_features_;

                return true;
        }
};

/* split kernel */

class CPUSplitKernelFunction : public SplitKernelFunction {
public:
        CPUDevice* device;
        void (*func)(KernelGlobals *kg, KernelData *data);

        CPUSplitKernelFunction(CPUDevice* device) : device(device), func(NULL) {}
        ~CPUSplitKernelFunction() {}

        virtual bool enqueue(const KernelDimensions& dim, device_memory& kernel_globals, device_memory& data)
        {
                if(!func) {
                        return false;
                }

                KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
                kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);

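                /* Emulate a GPU dispatch: iterate over the whole global work
                 * size and run the kernel once per work-item, with global_id
                 * set so the kernel can tell which element it is processing. */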
                for(int y = 0; y < dim.global_size[1]; y++) {
                        for(int x = 0; x < dim.global_size[0]; x++) {
                                kg->global_id = make_int2(x, y);

                                func(kg, (KernelData*)data.device_pointer);
                        }
                }

                return true;
        }
};

CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device)
{
}

bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
                                                    RenderTile& rtile,
                                                    int num_global_elements,
                                                    device_memory& kernel_globals,
                                                    device_memory& data,
                                                    device_memory& split_data,
                                                    device_memory& ray_state,
                                                    device_memory& queue_index,
                                                    device_memory& use_queues_flags,
                                                    device_memory& work_pool_wgs)
{
        typedef void(*data_init_t)(KernelGlobals *kg,
                                   ccl_constant KernelData *data,
                                   ccl_global void *split_data_buffer,
                                   int num_elements,
                                   ccl_global char *ray_state,
                                   ccl_global uint *rng_state,
                                   int start_sample,
                                   int end_sample,
                                   int sx, int sy, int sw, int sh, int offset, int stride,
                                   ccl_global int *Queue_index,
                                   int queuesize,
                                   ccl_global char *use_queues_flag,
                                   ccl_global unsigned int *work_pool_wgs,
                                   unsigned int num_samples,
                                   ccl_global float *buffer);

        data_init_t data_init;

#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
        if(system_cpu_support_avx2()) {
                data_init = kernel_cpu_avx2_data_init;
        }
        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
        if(system_cpu_support_avx()) {
                data_init = kernel_cpu_avx_data_init;
        }
        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
        if(system_cpu_support_sse41()) {
                data_init = kernel_cpu_sse41_data_init;
        }
        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
        if(system_cpu_support_sse3()) {
                data_init = kernel_cpu_sse3_data_init;
        }
        else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
        if(system_cpu_support_sse2()) {
                data_init = kernel_cpu_sse2_data_init;
        }
        else
#endif
        {
                data_init = kernel_cpu_data_init;
        }

        KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
        kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);

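        /* Like CPUSplitKernelFunction::enqueue(), run data_init once per
         * virtual work-item to populate the split state buffers for the tile. */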
        for(int y = 0; y < dim.global_size[1]; y++) {
                for(int x = 0; x < dim.global_size[0]; x++) {
                        kg->global_id = make_int2(x, y);

                        data_init((KernelGlobals*)kernel_globals.device_pointer,
                                  (KernelData*)data.device_pointer,
                                  (void*)split_data.device_pointer,
                                  num_global_elements,
                                  (char*)ray_state.device_pointer,
                                  (uint*)rtile.rng_state,
                                  rtile.start_sample,
                                  rtile.start_sample + rtile.num_samples,
                                  rtile.x,
                                  rtile.y,
                                  rtile.w,
                                  rtile.h,
                                  rtile.offset,
                                  rtile.stride,
                                  (int*)queue_index.device_pointer,
                                  dim.global_size[0] * dim.global_size[1],
                                  (char*)use_queues_flags.device_pointer,
                                  (uint*)work_pool_wgs.device_pointer,
                                  rtile.num_samples,
                                  (float*)rtile.buffer);
                }
        }

        return true;
}

SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&)
{
        CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);

        kernel->func = device->get_kernel_function<void(*)(KernelGlobals*, KernelData*)>(kernel_name);
        if(!kernel->func) {
                delete kernel;
                return NULL;
        }

        return kernel;
}

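/* A 1x1 local size is used because there are no work-groups to exploit on
 * the CPU; each "work-item" is just a loop iteration on the host thread. */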
int2 CPUSplitKernel::split_kernel_local_size()
{
        return make_int2(1, 1);
}

int2 CPUSplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask *task) {
        /* TODO(mai): this needs investigation, but the CPU gives an incorrect
         * render if the global size doesn't match the tile size. */
        return task->requested_tile_size;
}

size_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device_memory& /*data*/, size_t num_threads) {
        KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;

        return split_data_buffer_size(kg, num_threads);
}

unordered_map<string, void*> CPUDevice::kernel_functions;

Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
{
        return new CPUDevice(info, stats, background);
}

void device_cpu_info(vector<DeviceInfo>& devices)
{
        DeviceInfo info;

        info.type = DEVICE_CPU;
        info.description = system_cpu_brand_string();
        info.id = "CPU";
        info.num = 0;
        info.advanced_shading = true;
        info.pack_images = false;

        devices.insert(devices.begin(), info);
}

string device_cpu_capabilities(void)
{
        string capabilities = "";
        capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
        capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
        capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
        capabilities += system_cpu_support_avx() ? "AVX " : "";
        capabilities += system_cpu_support_avx2() ? "AVX2" : "";
        /* Trim the trailing space, guarding against an empty string when no
         * optional instruction sets are supported. */
        if(!capabilities.empty() && capabilities[capabilities.size() - 1] == ' ')
                capabilities.resize(capabilities.size() - 1);
        return capabilities;
}
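/* The returned string is space-separated, e.g. "SSE2 SSE3 SSE41 AVX AVX2"
 * on a machine that supports every optional instruction set. */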

CCL_NAMESPACE_END