Add support for multiple interpolation modes on Cycles image textures
[blender-staging.git] / intern / cycles / device / device_cpu.cpp
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include <stdlib.h>
18 #include <string.h>
19
20 #include "device.h"
21 #include "device_intern.h"
22
23 #include "kernel.h"
24 #include "kernel_compat_cpu.h"
25 #include "kernel_types.h"
26 #include "kernel_globals.h"
27
28 #include "osl_shader.h"
29 #include "osl_globals.h"
30
31 #include "buffers.h"
32
33 #include "util_debug.h"
34 #include "util_foreach.h"
35 #include "util_function.h"
36 #include "util_opengl.h"
37 #include "util_progress.h"
38 #include "util_system.h"
39 #include "util_thread.h"
40
41 CCL_NAMESPACE_BEGIN
42
43 class CPUDevice : public Device
44 {
45 public:
46         TaskPool task_pool;
47         KernelGlobals kernel_globals;
48
49 #ifdef WITH_OSL
50         OSLGlobals osl_globals;
51 #endif
52         
53         CPUDevice(DeviceInfo& info, Stats &stats, bool background)
54         : Device(info, stats, background)
55         {
56 #ifdef WITH_OSL
57                 kernel_globals.osl = &osl_globals;
58 #endif
59
60                 /* do now to avoid thread issues */
61                 system_cpu_support_sse2();
62                 system_cpu_support_sse3();
63                 system_cpu_support_sse41();
64                 system_cpu_support_avx();
65         }
66
67         ~CPUDevice()
68         {
69                 task_pool.stop();
70         }
71
72         void mem_alloc(device_memory& mem, MemoryType type)
73         {
74                 mem.device_pointer = mem.data_pointer;
75
76                 stats.mem_alloc(mem.memory_size());
77         }
78
79         void mem_copy_to(device_memory& mem)
80         {
81                 /* no-op */
82         }
83
84         void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
85         {
86                 /* no-op */
87         }
88
89         void mem_zero(device_memory& mem)
90         {
91                 memset((void*)mem.device_pointer, 0, mem.memory_size());
92         }
93
94         void mem_free(device_memory& mem)
95         {
96                 mem.device_pointer = 0;
97
98                 stats.mem_free(mem.memory_size());
99         }
100
101         void const_copy_to(const char *name, void *host, size_t size)
102         {
103                 kernel_const_copy(&kernel_globals, name, host, size);
104         }
105
106         void tex_alloc(const char *name, device_memory& mem, InterpolationType interpolation, bool periodic)
107         {
108                 kernel_tex_copy(&kernel_globals, name, mem.data_pointer, mem.data_width, mem.data_height, interpolation);
109                 mem.device_pointer = mem.data_pointer;
110
111                 stats.mem_alloc(mem.memory_size());
112         }
113
114         void tex_free(device_memory& mem)
115         {
116                 mem.device_pointer = 0;
117
118                 stats.mem_free(mem.memory_size());
119         }
120
121         void *osl_memory()
122         {
123 #ifdef WITH_OSL
124                 return &osl_globals;
125 #else
126                 return NULL;
127 #endif
128         }
129
130         void thread_run(DeviceTask *task)
131         {
132                 if(task->type == DeviceTask::PATH_TRACE)
133                         thread_path_trace(*task);
134                 else if(task->type == DeviceTask::FILM_CONVERT)
135                         thread_film_convert(*task);
136                 else if(task->type == DeviceTask::SHADER)
137                         thread_shader(*task);
138         }
139
140         class CPUDeviceTask : public DeviceTask {
141         public:
142                 CPUDeviceTask(CPUDevice *device, DeviceTask& task)
143                 : DeviceTask(task)
144                 {
145                         run = function_bind(&CPUDevice::thread_run, device, this);
146                 }
147         };
148
149         void thread_path_trace(DeviceTask& task)
150         {
151                 if(task_pool.canceled()) {
152                         if(task.need_finish_queue == false)
153                                 return;
154                 }
155
156                 KernelGlobals kg = kernel_globals;
157
158 #ifdef WITH_OSL
159                 OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
160 #endif
161
162                 RenderTile tile;
163                 
164                 while(task.acquire_tile(this, tile)) {
165                         float *render_buffer = (float*)tile.buffer;
166                         uint *rng_state = (uint*)tile.rng_state;
167                         int start_sample = tile.start_sample;
168                         int end_sample = tile.start_sample + tile.num_samples;
169
170 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
171                         if(system_cpu_support_avx()) {
172                                 for(int sample = start_sample; sample < end_sample; sample++) {
173                                         if (task.get_cancel() || task_pool.canceled()) {
174                                                 if(task.need_finish_queue == false)
175                                                         break;
176                                         }
177
178                                         for(int y = tile.y; y < tile.y + tile.h; y++) {
179                                                 for(int x = tile.x; x < tile.x + tile.w; x++) {
180                                                         kernel_cpu_avx_path_trace(&kg, render_buffer, rng_state,
181                                                                 sample, x, y, tile.offset, tile.stride);
182                                                 }
183                                         }
184
185                                         tile.sample = sample + 1;
186
187                                         task.update_progress(tile);
188                                 }
189                         }
190                         else
191 #endif
192 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41                       
193                         if(system_cpu_support_sse41()) {
194                                 for(int sample = start_sample; sample < end_sample; sample++) {
195                                         if (task.get_cancel() || task_pool.canceled()) {
196                                                 if(task.need_finish_queue == false)
197                                                         break;
198                                         }
199
200                                         for(int y = tile.y; y < tile.y + tile.h; y++) {
201                                                 for(int x = tile.x; x < tile.x + tile.w; x++) {
202                                                         kernel_cpu_sse41_path_trace(&kg, render_buffer, rng_state,
203                                                                 sample, x, y, tile.offset, tile.stride);
204                                                 }
205                                         }
206
207                                         tile.sample = sample + 1;
208
209                                         task.update_progress(tile);
210                                 }
211                         }
212                         else
213 #endif
214 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
215                         if(system_cpu_support_sse3()) {
216                                 for(int sample = start_sample; sample < end_sample; sample++) {
217                                         if (task.get_cancel() || task_pool.canceled()) {
218                                                 if(task.need_finish_queue == false)
219                                                         break;
220                                         }
221
222                                         for(int y = tile.y; y < tile.y + tile.h; y++) {
223                                                 for(int x = tile.x; x < tile.x + tile.w; x++) {
224                                                         kernel_cpu_sse3_path_trace(&kg, render_buffer, rng_state,
225                                                                 sample, x, y, tile.offset, tile.stride);
226                                                 }
227                                         }
228
229                                         tile.sample = sample + 1;
230
231                                         task.update_progress(tile);
232                                 }
233                         }
234                         else
235 #endif
236 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
237                         if(system_cpu_support_sse2()) {
238                                 for(int sample = start_sample; sample < end_sample; sample++) {
239                                         if (task.get_cancel() || task_pool.canceled()) {
240                                                 if(task.need_finish_queue == false)
241                                                         break;
242                                         }
243
244                                         for(int y = tile.y; y < tile.y + tile.h; y++) {
245                                                 for(int x = tile.x; x < tile.x + tile.w; x++) {
246                                                         kernel_cpu_sse2_path_trace(&kg, render_buffer, rng_state,
247                                                                 sample, x, y, tile.offset, tile.stride);
248                                                 }
249                                         }
250
251                                         tile.sample = sample + 1;
252
253                                         task.update_progress(tile);
254                                 }
255                         }
256                         else
257 #endif
258                         {
259                                 for(int sample = start_sample; sample < end_sample; sample++) {
260                                         if (task.get_cancel() || task_pool.canceled()) {
261                                                 if(task.need_finish_queue == false)
262                                                         break;
263                                         }
264
265                                         for(int y = tile.y; y < tile.y + tile.h; y++) {
266                                                 for(int x = tile.x; x < tile.x + tile.w; x++) {
267                                                         kernel_cpu_path_trace(&kg, render_buffer, rng_state,
268                                                                 sample, x, y, tile.offset, tile.stride);
269                                                 }
270                                         }
271
272                                         tile.sample = sample + 1;
273
274                                         task.update_progress(tile);
275                                 }
276                         }
277
278                         task.release_tile(tile);
279
280                         if(task_pool.canceled()) {
281                                 if(task.need_finish_queue == false)
282                                         break;
283                         }
284                 }
285
286 #ifdef WITH_OSL
287                 OSLShader::thread_free(&kg);
288 #endif
289         }
290
291         void thread_film_convert(DeviceTask& task)
292         {
293                 float sample_scale = 1.0f/(task.sample + 1);
294
295                 if(task.rgba_half) {
296 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
297                         if(system_cpu_support_avx()) {
298                                 for(int y = task.y; y < task.y + task.h; y++)
299                                         for(int x = task.x; x < task.x + task.w; x++)
300                                                 kernel_cpu_avx_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
301                                                         sample_scale, x, y, task.offset, task.stride);
302                         }
303                         else
304 #endif  
305 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41                       
306                         if(system_cpu_support_sse41()) {
307                                 for(int y = task.y; y < task.y + task.h; y++)
308                                         for(int x = task.x; x < task.x + task.w; x++)
309                                                 kernel_cpu_sse41_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
310                                                         sample_scale, x, y, task.offset, task.stride);
311                         }
312                         else
313 #endif          
314 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3                
315                         if(system_cpu_support_sse3()) {
316                                 for(int y = task.y; y < task.y + task.h; y++)
317                                         for(int x = task.x; x < task.x + task.w; x++)
318                                                 kernel_cpu_sse3_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
319                                                         sample_scale, x, y, task.offset, task.stride);
320                         }
321                         else
322 #endif
323 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
324                         if(system_cpu_support_sse2()) {
325                                 for(int y = task.y; y < task.y + task.h; y++)
326                                         for(int x = task.x; x < task.x + task.w; x++)
327                                                 kernel_cpu_sse2_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
328                                                         sample_scale, x, y, task.offset, task.stride);
329                         }
330                         else
331 #endif
332                         {
333                                 for(int y = task.y; y < task.y + task.h; y++)
334                                         for(int x = task.x; x < task.x + task.w; x++)
335                                                 kernel_cpu_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
336                                                         sample_scale, x, y, task.offset, task.stride);
337                         }
338                 }
339                 else {
340 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
341                         if(system_cpu_support_avx()) {
342                                 for(int y = task.y; y < task.y + task.h; y++)
343                                         for(int x = task.x; x < task.x + task.w; x++)
344                                                 kernel_cpu_avx_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
345                                                         sample_scale, x, y, task.offset, task.stride);
346                         }
347                         else
348 #endif          
349 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41                       
350                         if(system_cpu_support_sse41()) {
351                                 for(int y = task.y; y < task.y + task.h; y++)
352                                         for(int x = task.x; x < task.x + task.w; x++)
353                                                 kernel_cpu_sse41_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
354                                                         sample_scale, x, y, task.offset, task.stride);
355                         }
356                         else
357 #endif                  
358 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
359                         if(system_cpu_support_sse3()) {
360                                 for(int y = task.y; y < task.y + task.h; y++)
361                                         for(int x = task.x; x < task.x + task.w; x++)
362                                                 kernel_cpu_sse3_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
363                                                         sample_scale, x, y, task.offset, task.stride);
364                         }
365                         else
366 #endif
367 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
368                         if(system_cpu_support_sse2()) {
369                                 for(int y = task.y; y < task.y + task.h; y++)
370                                         for(int x = task.x; x < task.x + task.w; x++)
371                                                 kernel_cpu_sse2_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
372                                                         sample_scale, x, y, task.offset, task.stride);
373                         }
374                         else
375 #endif
376                         {
377                                 for(int y = task.y; y < task.y + task.h; y++)
378                                         for(int x = task.x; x < task.x + task.w; x++)
379                                                 kernel_cpu_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
380                                                         sample_scale, x, y, task.offset, task.stride);
381                         }
382                 }
383         }
384
385         void thread_shader(DeviceTask& task)
386         {
387                 KernelGlobals kg = kernel_globals;
388
389 #ifdef WITH_OSL
390                 OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
391 #endif
392
393 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
394                 if(system_cpu_support_avx()) {
395                         for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
396                                 kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
397
398                                 if(task_pool.canceled())
399                                         break;
400                         }
401                 }
402                 else
403 #endif
404 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41                       
405                 if(system_cpu_support_sse41()) {
406                         for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
407                                 kernel_cpu_sse41_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
408
409                                 if(task_pool.canceled())
410                                         break;
411                         }
412                 }
413                 else
414 #endif
415 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
416                 if(system_cpu_support_sse3()) {
417                         for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
418                                 kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
419
420                                 if(task_pool.canceled())
421                                         break;
422                         }
423                 }
424                 else
425 #endif
426 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
427                 if(system_cpu_support_sse2()) {
428                         for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
429                                 kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
430
431                                 if(task_pool.canceled())
432                                         break;
433                         }
434                 }
435                 else
436 #endif
437                 {
438                         for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
439                                 kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
440
441                                 if(task_pool.canceled())
442                                         break;
443                         }
444                 }
445
446 #ifdef WITH_OSL
447                 OSLShader::thread_free(&kg);
448 #endif
449         }
450
451         void task_add(DeviceTask& task)
452         {
453                 /* split task into smaller ones */
454                 list<DeviceTask> tasks;
455                 task.split(tasks, TaskScheduler::num_threads());
456
457                 foreach(DeviceTask& task, tasks)
458                         task_pool.push(new CPUDeviceTask(this, task));
459         }
460
461         void task_wait()
462         {
463                 task_pool.wait_work();
464         }
465
466         void task_cancel()
467         {
468                 task_pool.cancel();
469         }
470 };
471
472 Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
473 {
474         return new CPUDevice(info, stats, background);
475 }
476
477 void device_cpu_info(vector<DeviceInfo>& devices)
478 {
479         DeviceInfo info;
480
481         info.type = DEVICE_CPU;
482         info.description = system_cpu_brand_string();
483         info.id = "CPU";
484         info.num = 0;
485         info.advanced_shading = true;
486         info.pack_images = false;
487
488         devices.insert(devices.begin(), info);
489 }
490
491 CCL_NAMESPACE_END
492