Cycles: viewport render now takes scene color management settings into account,
authorBrecht Van Lommel <brechtvanlommel@pandora.be>
Fri, 30 Aug 2013 23:49:38 +0000 (23:49 +0000)
committerBrecht Van Lommel <brechtvanlommel@pandora.be>
Fri, 30 Aug 2013 23:49:38 +0000 (23:49 +0000)
except for curves, that's still missing from the OpenColorIO GLSL shader.

The pixels are stored in a half float texture, converterd from full float with
native GPU instructions and SIMD on the CPU, so it should be pretty quick.
Using a GLSL shader is useful for GPU render because it avoids a copy through
CPU memory.

25 files changed:
intern/cycles/blender/blender_session.cpp
intern/cycles/blender/blender_sync.cpp
intern/cycles/device/device.cpp
intern/cycles/device/device_cpu.cpp
intern/cycles/device/device_cuda.cpp
intern/cycles/device/device_memory.h
intern/cycles/device/device_multi.cpp
intern/cycles/device/device_network.cpp
intern/cycles/device/device_network.h
intern/cycles/device/device_opencl.cpp
intern/cycles/device/device_task.cpp
intern/cycles/device/device_task.h
intern/cycles/kernel/kernel.cl
intern/cycles/kernel/kernel.cpp
intern/cycles/kernel/kernel.cu
intern/cycles/kernel/kernel.h
intern/cycles/kernel/kernel_compat_cuda.h
intern/cycles/kernel/kernel_film.h
intern/cycles/kernel/kernel_sse2.cpp
intern/cycles/kernel/kernel_sse3.cpp
intern/cycles/render/buffers.cpp
intern/cycles/render/buffers.h
intern/cycles/render/session.cpp
intern/cycles/render/session.h
intern/cycles/util/util_types.h

index 939eed2c77b778f353980fced82199714bfeb44c..2339006011838739348435d8023fd75aeb151fe5 100644 (file)
@@ -588,7 +588,15 @@ bool BlenderSession::draw(int w, int h)
        /* draw */
        BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_scene, b_v3d, b_rv3d, scene->camera, width, height);
 
-       return !session->draw(buffer_params);
+       if(session->params.display_buffer_linear)
+               b_engine.bind_display_space_shader(b_scene);
+
+       bool draw_ok = !session->draw(buffer_params);
+
+       if(session->params.display_buffer_linear)
+               b_engine.unbind_display_space_shader();
+       
+       return draw_ok;
 }
 
 void BlenderSession::get_status(string& status, string& substatus)
index 4a686487462cd79f1e1989c6320e91da7c575bcf..58ce08665ef7d772053a551bf06d57513c56b38f 100644 (file)
@@ -492,6 +492,9 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine b_engine, BL::Use
                params.shadingsystem = SessionParams::SVM;
        else if(shadingsystem == 1)
                params.shadingsystem = SessionParams::OSL;
+       
+       /* color managagement */
+       params.display_buffer_linear = b_engine.support_display_space_shader(b_scene);
 
        return params;
 }
index e42f83be6ceac7e966fd2fb26f32e86fda945abb..10d4112b57db57a64141cf6f6ced3a9ba5fa5e5f 100644 (file)
@@ -41,7 +41,10 @@ void Device::pixels_alloc(device_memory& mem)
 
 void Device::pixels_copy_from(device_memory& mem, int y, int w, int h)
 {
-       mem_copy_from(mem, y, w, h, sizeof(uint8_t)*4);
+       if(mem.data_type == TYPE_HALF)
+               mem_copy_from(mem, y, w, h, sizeof(half4));
+       else
+               mem_copy_from(mem, y, w, h, sizeof(uchar4));
 }
 
 void Device::pixels_free(device_memory& mem)
@@ -53,27 +56,49 @@ void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dy, int w
 {
        pixels_copy_from(rgba, y, w, h);
 
+       GLuint texid;
+       glGenTextures(1, &texid);
+       glBindTexture(GL_TEXTURE_2D, texid);
+       if(rgba.data_type == TYPE_HALF)
+               glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F_ARB, w, h, 0, GL_RGBA, GL_HALF_FLOAT, (void*)rgba.data_pointer);
+       else
+               glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, w, h, 0, GL_RGBA, GL_UNSIGNED_BYTE, (void*)rgba.data_pointer);
+       glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+       glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+
+       glEnable(GL_TEXTURE_2D);
+       
        if(transparent) {
                glEnable(GL_BLEND);
                glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
        }
 
-       glPixelZoom((float)width/(float)w, (float)height/(float)h);
-       glRasterPos2f(0, dy);
+       glColor3f(1.0f, 1.0f, 1.0f);
 
-       uint8_t *pixels = (uint8_t*)rgba.data_pointer;
+       glPushMatrix();
+       glTranslatef(0.0f, (float)dy, 0.0f);
 
-       /* for multi devices, this assumes the ineffecient method that we allocate
-        * all pixels on the device even though we only render to a subset */
-       pixels += 4*y*w;
+       glBegin(GL_QUADS);
+       
+       glTexCoord2f(0.0f, 0.0f);
+       glVertex2f(0.0f, 0.0f);
+       glTexCoord2f(1.0f, 0.0f);
+       glVertex2f((float)width, 0.0f);
+       glTexCoord2f(1.0f, 1.0f);
+       glVertex2f((float)width, (float)height);
+       glTexCoord2f(0.0f, 1.0f);
+       glVertex2f(0.0f, (float)height);
 
-       glDrawPixels(w, h, GL_RGBA, GL_UNSIGNED_BYTE, pixels);
+       glEnd();
 
-       glRasterPos2f(0.0f, 0.0f);
-       glPixelZoom(1.0f, 1.0f);
+       glPopMatrix();
 
        if(transparent)
                glDisable(GL_BLEND);
+
+       glBindTexture(GL_TEXTURE_2D, 0);
+       glDisable(GL_TEXTURE_2D);
+       glDeleteTextures(1, &texid);
 }
 
 Device *Device::create(DeviceInfo& info, Stats &stats, bool background)
index d9c08dadbb00096e48c6812c94fca2484d19ae73..b1dbdec9d361e5d2d5543c7bb31e0c9e090d7b10 100644 (file)
@@ -127,8 +127,8 @@ public:
        {
                if(task->type == DeviceTask::PATH_TRACE)
                        thread_path_trace(*task);
-               else if(task->type == DeviceTask::TONEMAP)
-                       thread_tonemap(*task);
+               else if(task->type == DeviceTask::FILM_CONVERT)
+                       thread_film_convert(*task);
                else if(task->type == DeviceTask::SHADER)
                        thread_shader(*task);
        }
@@ -237,28 +237,55 @@ public:
 #endif
        }
 
-       void thread_tonemap(DeviceTask& task)
+       void thread_film_convert(DeviceTask& task)
        {
+               float sample_scale = 1.0f/(task.sample + 1);
+
+               if(task.rgba_half) {
 #ifdef WITH_OPTIMIZED_KERNEL
-               if(system_cpu_support_sse3()) {
-                       for(int y = task.y; y < task.y + task.h; y++)
-                               for(int x = task.x; x < task.x + task.w; x++)
-                                       kernel_cpu_sse3_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer,
-                                               task.sample, x, y, task.offset, task.stride);
-               }
-               else if(system_cpu_support_sse2()) {
-                       for(int y = task.y; y < task.y + task.h; y++)
-                               for(int x = task.x; x < task.x + task.w; x++)
-                                       kernel_cpu_sse2_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer,
-                                               task.sample, x, y, task.offset, task.stride);
+                       if(system_cpu_support_sse3()) {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_sse3_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
+                       else if(system_cpu_support_sse2()) {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_sse2_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
+                       else
+#endif
+                       {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
                }
-               else
+               else {
+#ifdef WITH_OPTIMIZED_KERNEL
+                       if(system_cpu_support_sse3()) {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_sse3_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
+                       else if(system_cpu_support_sse2()) {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_sse2_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
+                       else
 #endif
-               {
-                       for(int y = task.y; y < task.y + task.h; y++)
-                               for(int x = task.x; x < task.x + task.w; x++)
-                                       kernel_cpu_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer,
-                                               task.sample, x, y, task.offset, task.stride);
+                       {
+                               for(int y = task.y; y < task.y + task.h; y++)
+                                       for(int x = task.x; x < task.x + task.w; x++)
+                                               kernel_cpu_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+                                                       sample_scale, x, y, task.offset, task.stride);
+                       }
                }
        }
 
index c1b5a8bfcea591ce9ba8d0c00c609cd71cb17d1f..b5eaa69bf0e8e2343baaeb1a98fdb06a1f6274c0 100644 (file)
@@ -625,7 +625,7 @@ public:
                cuda_pop_context();
        }
 
-       void tonemap(DeviceTask& task, device_ptr buffer, device_ptr rgba)
+       void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
        {
                if(have_error())
                        return;
@@ -633,11 +633,14 @@ public:
                cuda_push_context();
 
                CUfunction cuFilmConvert;
-               CUdeviceptr d_rgba = map_pixels(rgba);
+               CUdeviceptr d_rgba = map_pixels((rgba_byte)? rgba_byte: rgba_half);
                CUdeviceptr d_buffer = cuda_device_ptr(buffer);
 
                /* get kernel function */
-               cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_tonemap"))
+               if(rgba_half)
+                       cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float"))
+               else
+                       cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"))
 
                /* pass in parameters */
                int offset = 0;
@@ -648,11 +651,11 @@ public:
                cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_buffer, sizeof(d_buffer)))
                offset += sizeof(d_buffer);
 
-               int sample = task.sample;
-               offset = align_up(offset, __alignof(sample));
+               float sample_scale = 1.0f/(task.sample + 1);
+               offset = align_up(offset, __alignof(sample_scale));
 
-               cuda_assert(cuParamSeti(cuFilmConvert, offset, task.sample))
-               offset += sizeof(task.sample);
+               cuda_assert(cuParamSetf(cuFilmConvert, offset, sample_scale))
+               offset += sizeof(sample_scale);
 
                cuda_assert(cuParamSeti(cuFilmConvert, offset, task.x))
                offset += sizeof(task.x);
@@ -684,7 +687,7 @@ public:
                cuda_assert(cuFuncSetBlockShape(cuFilmConvert, xthreads, ythreads, 1))
                cuda_assert(cuLaunchGrid(cuFilmConvert, xblocks, yblocks))
 
-               unmap_pixels(task.rgba);
+               unmap_pixels((rgba_byte)? rgba_byte: rgba_half);
 
                cuda_pop_context();
        }
@@ -771,13 +774,19 @@ public:
 
                        glGenBuffers(1, &pmem.cuPBO);
                        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
-                       glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(GLfloat)*3, NULL, GL_DYNAMIC_DRAW);
+                       if(mem.data_type == TYPE_HALF)
+                               glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(GLhalf)*4, NULL, GL_DYNAMIC_DRAW);
+                       else
+                               glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(uint8_t)*4, NULL, GL_DYNAMIC_DRAW);
                        
                        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
                        
                        glGenTextures(1, &pmem.cuTexId);
                        glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
-                       glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
+                       if(mem.data_type == TYPE_HALF)
+                               glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F_ARB, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL);
+                       else
+                               glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
                        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
                        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
                        glBindTexture(GL_TEXTURE_2D, 0);
@@ -865,11 +874,19 @@ public:
 
                        /* for multi devices, this assumes the ineffecient method that we allocate
                         * all pixels on the device even though we only render to a subset */
-                       size_t offset = sizeof(uint8_t)*4*y*w;
+                       size_t offset = 4*y*w;
+
+                       if(mem.data_type == TYPE_HALF)
+                               offset *= sizeof(GLhalf);
+                       else
+                               offset *= sizeof(uint8_t);
 
                        glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pmem.cuPBO);
                        glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
-                       glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void*)offset);
+                       if(mem.data_type == TYPE_HALF)
+                               glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void*)offset);
+                       else
+                               glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void*)offset);
                        glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
                        
                        glEnable(GL_TEXTURE_2D);
@@ -961,9 +978,9 @@ public:
 
        void task_add(DeviceTask& task)
        {
-               if(task.type == DeviceTask::TONEMAP) {
+               if(task.type == DeviceTask::FILM_CONVERT) {
                        /* must be done in main thread due to opengl access */
-                       tonemap(task, task.buffer, task.rgba);
+                       film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
 
                        cuda_push_context();
                        cuda_assert(cuCtxSynchronize())
index d27dd19cc9674904cbbcf1b180f728863ffa6bfe..18e6242d23dc1f81fecf136b4115825ad74b1f79 100644 (file)
@@ -46,7 +46,8 @@ enum DataType {
        TYPE_UCHAR,
        TYPE_UINT,
        TYPE_INT,
-       TYPE_FLOAT
+       TYPE_FLOAT,
+       TYPE_HALF
 };
 
 static inline size_t datatype_size(DataType datatype) 
@@ -56,6 +57,7 @@ static inline size_t datatype_size(DataType datatype)
                case TYPE_FLOAT: return sizeof(float);
                case TYPE_UINT: return sizeof(uint);
                case TYPE_INT: return sizeof(int);
+               case TYPE_HALF: return sizeof(half);
                default: return 0;
        }
 }
@@ -147,6 +149,11 @@ template<> struct device_type_traits<float4> {
        static const int num_elements = 4;
 };
 
+template<> struct device_type_traits<half4> {
+       static const DataType data_type = TYPE_HALF;
+       static const int num_elements = 4;
+};
+
 /* Device Memory */
 
 class device_memory
index af6ca9e1fbddb456333201e0255d988e26c6b521..4df0fdbd4c7f5a5835a29a13a3e9f617c0af1bb3 100644 (file)
@@ -261,7 +261,6 @@ public:
                        if(sub.device == sub_device) {
                                if(tile.buffer) tile.buffer = sub.ptr_map[tile.buffer];
                                if(tile.rng_state) tile.rng_state = sub.ptr_map[tile.rng_state];
-                               if(tile.rgba) tile.rgba = sub.ptr_map[tile.rgba];
                        }
                }
        }
@@ -290,7 +289,8 @@ public:
                                tasks.pop_front();
 
                                if(task.buffer) subtask.buffer = sub.ptr_map[task.buffer];
-                               if(task.rgba) subtask.rgba = sub.ptr_map[task.rgba];
+                               if(task.rgba_byte) subtask.rgba_byte = sub.ptr_map[task.rgba_byte];
+                               if(task.rgba_half) subtask.rgba_half = sub.ptr_map[task.rgba_half];
                                if(task.shader_input) subtask.shader_input = sub.ptr_map[task.shader_input];
                                if(task.shader_output) subtask.shader_output = sub.ptr_map[task.shader_output];
 
index 521739b8ef117a0ca9fac5d34822b0b1c115bd6c..23c1a10fa0a673876ed06a69309af31791f69b7d 100644 (file)
@@ -408,7 +408,8 @@ protected:
                        rcv.read(task);
 
                        if(task.buffer) task.buffer = ptr_map[task.buffer];
-                       if(task.rgba) task.rgba = ptr_map[task.rgba];
+                       if(task.rgba_byte) task.rgba_byte = ptr_map[task.rgba_byte];
+                       if(task.rgba_half) task.rgba_half = ptr_map[task.rgba_half];
                        if(task.shader_input) task.shader_input = ptr_map[task.shader_input];
                        if(task.shader_output) task.shader_output = ptr_map[task.shader_output];
 
@@ -448,7 +449,6 @@ protected:
 
                                if(tile.buffer) tile.buffer = ptr_map[tile.buffer];
                                if(tile.rng_state) tile.rng_state = ptr_map[tile.rng_state];
-                               if(tile.rgba) tile.rgba = ptr_map[tile.rgba];
 
                                result = true;
                                break;
@@ -478,7 +478,6 @@ protected:
 
                if(tile.buffer) tile.buffer = ptr_imap[tile.buffer];
                if(tile.rng_state) tile.rng_state = ptr_imap[tile.rng_state];
-               if(tile.rgba) tile.rgba = ptr_imap[tile.rgba];
 
                RPCSend snd(socket, "release_tile");
                snd.add(tile);
index 5fe574fd4d4b3c5e2ec9c2e5f0ee7a793d04cdef..db399cf424013ec7031e875ed081909c6c9c3152 100644 (file)
@@ -94,7 +94,7 @@ public:
                int type = (int)task.type;
 
                archive & type & task.x & task.y & task.w & task.h;
-               archive & task.rgba & task.buffer & task.sample & task.num_samples;
+               archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples;
                archive & task.offset & task.stride;
                archive & task.shader_input & task.shader_output & task.shader_eval_type;
                archive & task.shader_x & task.shader_w;
@@ -105,7 +105,7 @@ public:
                archive & tile.x & tile.y & tile.w & tile.h;
                archive & tile.start_sample & tile.num_samples & tile.sample;
                archive & tile.offset & tile.stride;
-               archive & tile.buffer & tile.rng_state & tile.rgba;
+               archive & tile.buffer & tile.rng_state;
        }
 
        void write()
@@ -234,7 +234,7 @@ public:
                int type;
 
                *archive & type & task.x & task.y & task.w & task.h;
-               *archive & task.rgba & task.buffer & task.sample & task.num_samples;
+               *archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples;
                *archive & task.resolution & task.offset & task.stride;
                *archive & task.shader_input & task.shader_output & task.shader_eval_type;
                *archive & task.shader_x & task.shader_w;
@@ -247,7 +247,7 @@ public:
                *archive & tile.x & tile.y & tile.w & tile.h;
                *archive & tile.start_sample & tile.num_samples & tile.sample;
                *archive & tile.resolution & tile.offset & tile.stride;
-               *archive & tile.buffer & tile.rng_state & tile.rgba;
+               *archive & tile.buffer & tile.rng_state & tile.rgba_byte & tile.rgba_half;
 
                tile.buffers = NULL;
        }
index e800b3f64426eaec269fd63483d0fa6b086a353a..d723df70c894f9d55eae71fb8297d2acf2db5f80 100644 (file)
@@ -321,7 +321,8 @@ public:
        cl_device_id cdDevice;
        cl_program cpProgram;
        cl_kernel ckPathTraceKernel;
-       cl_kernel ckFilmConvertKernel;
+       cl_kernel ckFilmConvertByteKernel;
+       cl_kernel ckFilmConvertHalfFloatKernel;
        cl_kernel ckShaderKernel;
        cl_int ciErr;
 
@@ -431,7 +432,8 @@ public:
                cqCommandQueue = NULL;
                cpProgram = NULL;
                ckPathTraceKernel = NULL;
-               ckFilmConvertKernel = NULL;
+               ckFilmConvertByteKernel = NULL;
+               ckFilmConvertHalfFloatKernel = NULL;
                ckShaderKernel = NULL;
                null_mem = 0;
                device_initialized = false;
@@ -762,7 +764,11 @@ public:
                if(opencl_error(ciErr))
                        return false;
 
-               ckFilmConvertKernel = clCreateKernel(cpProgram, "kernel_ocl_tonemap", &ciErr);
+               ckFilmConvertByteKernel = clCreateKernel(cpProgram, "kernel_ocl_convert_to_byte", &ciErr);
+               if(opencl_error(ciErr))
+                       return false;
+
+               ckFilmConvertHalfFloatKernel = clCreateKernel(cpProgram, "kernel_ocl_convert_to_half_float", &ciErr);
                if(opencl_error(ciErr))
                        return false;
 
@@ -788,8 +794,10 @@ public:
 
                if(ckPathTraceKernel)
                        clReleaseKernel(ckPathTraceKernel);  
-               if(ckFilmConvertKernel)
-                       clReleaseKernel(ckFilmConvertKernel);  
+               if(ckFilmConvertByteKernel)
+                       clReleaseKernel(ckFilmConvertByteKernel);  
+               if(ckFilmConvertHalfFloatKernel)
+                       clReleaseKernel(ckFilmConvertHalfFloatKernel);  
                if(cpProgram)
                        clReleaseProgram(cpProgram);
                if(cqCommandQueue)
@@ -980,17 +988,17 @@ public:
                return err;
        }
 
-       void tonemap(DeviceTask& task, device_ptr buffer, device_ptr rgba)
+       void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
        {
                /* cast arguments to cl types */
                cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
-               cl_mem d_rgba = CL_MEM_PTR(rgba);
+               cl_mem d_rgba = (rgba_byte)? CL_MEM_PTR(rgba_byte): CL_MEM_PTR(rgba_half);
                cl_mem d_buffer = CL_MEM_PTR(buffer);
                cl_int d_x = task.x;
                cl_int d_y = task.y;
                cl_int d_w = task.w;
                cl_int d_h = task.h;
-               cl_int d_sample = task.sample;
+               cl_float d_sample_scale = 1.0f/(task.sample + 1);
                cl_int d_offset = task.offset;
                cl_int d_stride = task.stride;
 
@@ -998,6 +1006,8 @@ public:
                cl_uint narg = 0;
                ciErr = 0;
 
+               cl_kernel ckFilmConvertKernel = (rgba_byte)? ckFilmConvertByteKernel: ckFilmConvertHalfFloatKernel;
+
                ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_data), (void*)&d_data);
                ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_rgba), (void*)&d_rgba);
                ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_buffer), (void*)&d_buffer);
@@ -1006,7 +1016,7 @@ public:
        ciErr |= set_kernel_arg_mem(ckFilmConvertKernel, &narg, #name);
 #include "kernel_textures.h"
 
-               ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_sample), (void*)&d_sample);
+               ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_sample_scale), (void*)&d_sample_scale);
                ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_x), (void*)&d_x);
                ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_y), (void*)&d_y);
                ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_w), (void*)&d_w);
@@ -1052,8 +1062,8 @@ public:
 
        void thread_run(DeviceTask *task)
        {
-               if(task->type == DeviceTask::TONEMAP) {
-                       tonemap(*task, task->buffer, task->rgba);
+               if(task->type == DeviceTask::FILM_CONVERT) {
+                       film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
                }
                else if(task->type == DeviceTask::SHADER) {
                        shader(*task);
index 8c1e29206352f2cfbd17ffa9146eda64a39ea3c3..7d0eeab780dbfd6fd8b89aa82550f752c677d54b 100644 (file)
@@ -27,7 +27,7 @@ CCL_NAMESPACE_BEGIN
 /* Device Task */
 
 DeviceTask::DeviceTask(Type type_)
-: type(type_), x(0), y(0), w(0), h(0), rgba(0), buffer(0),
+: type(type_), x(0), y(0), w(0), h(0), rgba_byte(0), rgba_half(0), buffer(0),
   sample(0), num_samples(1),
   shader_input(0), shader_output(0),
   shader_eval_type(0), shader_x(0), shader_w(0)
index e232e128827193089937b65a237230813ce56142..c1bd39b70ca41c3fc80799ff8db36c00235e4b78 100644 (file)
@@ -34,11 +34,12 @@ class Tile;
 
 class DeviceTask : public Task {
 public:
-       typedef enum { PATH_TRACE, TONEMAP, SHADER } Type;
+       typedef enum { PATH_TRACE, FILM_CONVERT, SHADER } Type;
        Type type;
 
        int x, y, w, h;
-       device_ptr rgba;
+       device_ptr rgba_byte;
+       device_ptr rgba_half;
        device_ptr buffer;
        int sample;
        int num_samples;
index dd8ffdd2b332c9accc5b88d367570eeaba81f281..28e72d78731ef03b32d7d97e2a0690bcba29032e 100644 (file)
@@ -52,7 +52,7 @@ __kernel void kernel_ocl_path_trace(
                kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
 }
 
-__kernel void kernel_ocl_tonemap(
+__kernel void kernel_ocl_convert_to_byte(
        __constant KernelData *data,
        __global uchar4 *rgba,
        __global float *buffer,
@@ -61,7 +61,34 @@ __kernel void kernel_ocl_tonemap(
        __global type *name,
 #include "kernel_textures.h"
 
-       int sample,
+       float sample_scale,
+       int sx, int sy, int sw, int sh, int offset, int stride)
+{
+       KernelGlobals kglobals, *kg = &kglobals;
+
+       kg->data = data;
+
+#define KERNEL_TEX(type, ttype, name) \
+       kg->name = name;
+#include "kernel_textures.h"
+
+       int x = sx + get_global_id(0);
+       int y = sy + get_global_id(1);
+
+       if(x < sx + sw && y < sy + sh)
+               kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+__kernel void kernel_ocl_convert_to_half_float(
+       __constant KernelData *data,
+       __global uchar4 *rgba,
+       __global float *buffer,
+
+#define KERNEL_TEX(type, ttype, name) \
+       __global type *name,
+#include "kernel_textures.h"
+
+       float sample_scale,
        int sx, int sy, int sw, int sh, int offset, int stride)
 {
        KernelGlobals kglobals, *kg = &kglobals;
@@ -76,7 +103,7 @@ __kernel void kernel_ocl_tonemap(
        int y = sy + get_global_id(1);
 
        if(x < sx + sw && y < sy + sh)
-               kernel_film_tonemap(kg, rgba, buffer, sample, x, y, offset, stride);
+               kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
 }
 
 __kernel void kernel_ocl_shader(
index 3f357763a8f7aa776ae91c6bcf288ebefc9d654c..3e2727fde9a588ca78a94d7d37dd09a613dc0935 100644 (file)
@@ -96,11 +96,16 @@ void kernel_cpu_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_s
                kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
 }
 
-/* Tonemapping */
+/* Film */
 
-void kernel_cpu_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer, int sample, int x, int y, int offset, int stride)
+void kernel_cpu_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
 {
-       kernel_film_tonemap(kg, rgba, buffer, sample, x, y, offset, stride);
+       kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+       kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
 }
 
 /* Shader Evaluation */
index c4da1d440b7bdc46538ecbffa68abc7ca3e8f7dc..5e6748c66fcb2a4e88416c80584aedc415d7ec93 100644 (file)
@@ -44,13 +44,22 @@ extern "C" __global__ void kernel_cuda_branched_path_trace(float *buffer, uint *
 }
 #endif
 
-extern "C" __global__ void kernel_cuda_tonemap(uchar4 *rgba, float *buffer, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
+extern "C" __global__ void kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
 {
        int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
        int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
 
        if(x < sx + sw && y < sy + sh)
-               kernel_film_tonemap(NULL, rgba, buffer, sample, x, y, offset, stride);
+               kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+extern "C" __global__ void kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+       int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
+       int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
+
+       if(x < sx + sw && y < sy + sh)
+               kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
 }
 
 extern "C" __global__ void kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx)
index 6efc28ed2af63e2227e30b8d4f594dda6034ad25..361f5b0856d2ac1ff94313a39f54efeb9cd438a1 100644 (file)
@@ -36,23 +36,29 @@ void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t
 
 void kernel_cpu_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
        int sample, int x, int y, int offset, int stride);
-void kernel_cpu_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer,
-       int sample, int x, int y, int offset, int stride);
+void kernel_cpu_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer,
+       float sample_scale, int x, int y, int offset, int stride);
+void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
+       float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output,
        int type, int i);
 
 #ifdef WITH_OPTIMIZED_KERNEL
 void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
        int sample, int x, int y, int offset, int stride);
-void kernel_cpu_sse2_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer,
-       int sample, int x, int y, int offset, int stride);
+void kernel_cpu_sse2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer,
+       float sample_scale, int x, int y, int offset, int stride);
+void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
+       float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output,
        int type, int i);
 
 void kernel_cpu_sse3_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
        int sample, int x, int y, int offset, int stride);
-void kernel_cpu_sse3_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer,
-       int sample, int x, int y, int offset, int stride);
+void kernel_cpu_sse3_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer,
+       float sample_scale, int x, int y, int offset, int stride);
+void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
+       float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output,
        int type, int i);
 #endif
index a6bf4a969751cba1ccec0eb96d960f25991e6343..cb86ce8c4ae024a5f317adbc9c45034696711428 100644 (file)
@@ -25,8 +25,6 @@
 #include <cuda.h>
 #include <float.h>
 
-#include "util_types.h"
-
 /* Qualifier wrappers for different names on different devices */
 
 #define __device  __device__ __inline__
 
 #define kernel_assert(cond)
 
+/* Types */
+
+#include "util_types.h"
+
 /* Textures */
 
 typedef texture<float4, 1> texture_float4;
index ba2149cc709fd889844d43895a019608c449f343..3ef33a2703bfc91ef4e3c7162fa43735b55ce45f 100644 (file)
@@ -16,9 +16,8 @@
 
 CCL_NAMESPACE_BEGIN
 
-__device float4 film_map(KernelGlobals *kg, float4 irradiance, int sample)
+__device float4 film_map(KernelGlobals *kg, float4 irradiance, float scale)
 {
-       float scale = 1.0f/(float)(sample+1);
        float exposure = kernel_data.film.exposure;
        float4 result = irradiance*scale;
 
@@ -46,9 +45,9 @@ __device uchar4 film_float_to_byte(float4 color)
        return result;
 }
 
-__device void kernel_film_tonemap(KernelGlobals *kg,
+__device void kernel_film_convert_to_byte(KernelGlobals *kg,
        __global uchar4 *rgba, __global float *buffer,
-       int sample, int x, int y, int offset, int stride)
+       float sample_scale, int x, int y, int offset, int stride)
 {
        /* buffer offset */
        int index = offset + x + y*stride;
@@ -58,11 +57,25 @@ __device void kernel_film_tonemap(KernelGlobals *kg,
 
        /* map colors */
        float4 irradiance = *((__global float4*)buffer);
-       float4 float_result = film_map(kg, irradiance, sample);
+       float4 float_result = film_map(kg, irradiance, sample_scale);
        uchar4 byte_result = film_float_to_byte(float_result);
 
        *rgba = byte_result;
 }
 
+__device void kernel_film_convert_to_half_float(KernelGlobals *kg,
+       __global uchar4 *rgba, __global float *buffer,
+       float sample_scale, int x, int y, int offset, int stride)
+{
+       /* buffer offset */
+       int index = offset + x + y*stride;
+
+       float4 *in = (__global float4*)(buffer + index*kernel_data.film.pass_stride);
+       half *out = (half*)rgba + index*4;
+       float scale = kernel_data.film.exposure*sample_scale;
+
+       float4_store_half(out, in, scale);
+}
+
 CCL_NAMESPACE_END
 
index e0413ddf44574d6e48c04d701a3f072a193d7bee..862626d6899a30b320ba183ae0afb4b25fa7cd95 100644 (file)
@@ -45,11 +45,16 @@ void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *
                kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
 }
 
-/* Tonemapping */
+/* Film */
 
-void kernel_cpu_sse2_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer, int sample, int x, int y, int offset, int stride)
+void kernel_cpu_sse2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
 {
-       kernel_film_tonemap(kg, rgba, buffer, sample, x, y, offset, stride);
+       kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+       kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
 }
 
 /* Shader Evaluate */
index 86f4705ca1801aaabaf2afb8db083cf0c8514dbf..c44098606a5e2bdf8a6683a70fbb04bccb803884 100644 (file)
@@ -47,11 +47,16 @@ void kernel_cpu_sse3_path_trace(KernelGlobals *kg, float *buffer, unsigned int *
                kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
 }
 
-/* Tonemapping */
+/* Film */
 
-void kernel_cpu_sse3_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer, int sample, int x, int y, int offset, int stride)
+void kernel_cpu_sse3_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
 {
-       kernel_film_tonemap(kg, rgba, buffer, sample, x, y, offset, stride);
+       kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+       kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
 }
 
 /* Shader Evaluate */
index e0bc3f40c4e141c4921c86d0f61285dff772ff4e..5fb648cec5fc3c15780667fb60aeef5e683074f2 100644 (file)
@@ -91,7 +91,6 @@ RenderTile::RenderTile()
 
        buffer = 0;
        rng_state = 0;
-       rgba = 0;
 
        buffers = NULL;
 }
@@ -298,12 +297,13 @@ bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int
 
 /* Display Buffer */
 
-DisplayBuffer::DisplayBuffer(Device *device_)
+DisplayBuffer::DisplayBuffer(Device *device_, bool linear)
 {
        device = device_;
        draw_width = 0;
        draw_height = 0;
        transparent = true; /* todo: determine from background */
+       half_float = linear;
 }
 
 DisplayBuffer::~DisplayBuffer()
@@ -313,9 +313,13 @@ DisplayBuffer::~DisplayBuffer()
 
 void DisplayBuffer::device_free()
 {
-       if(rgba.device_pointer) {
-               device->pixels_free(rgba);
-               rgba.clear();
+       if(rgba_byte.device_pointer) {
+               device->pixels_free(rgba_byte);
+               rgba_byte.clear();
+       }
+       if(rgba_half.device_pointer) {
+               device->pixels_free(rgba_half);
+               rgba_half.clear();
        }
 }
 
@@ -330,8 +334,14 @@ void DisplayBuffer::reset(Device *device, BufferParams& params_)
        device_free();
 
        /* allocate display pixels */
-       rgba.resize(params.width, params.height);
-       device->pixels_alloc(rgba);
+       if(half_float) {
+               rgba_half.resize(params.width, params.height);
+               device->pixels_alloc(rgba_half);
+       }
+       else {
+               rgba_byte.resize(params.width, params.height);
+               device->pixels_alloc(rgba_byte);
+       }
 }
 
 void DisplayBuffer::draw_set(int width, int height)
@@ -347,6 +357,7 @@ void DisplayBuffer::draw(Device *device)
        if(draw_width != 0 && draw_height != 0) {
                glPushMatrix();
                glTranslatef(params.full_x, params.full_y, 0.0f);
+               device_memory& rgba = rgba_data();
 
                device->draw_pixels(rgba, 0, draw_width, draw_height, 0, params.width, params.height, transparent);
 
@@ -366,8 +377,12 @@ void DisplayBuffer::write(Device *device, const string& filename)
 
        if(w == 0 || h == 0)
                return;
+       
+       if(half_float)
+               return;
 
        /* read buffer from device */
+       device_memory& rgba = rgba_data();
        device->pixels_copy_from(rgba, 0, w, h);
 
        /* write image */
@@ -389,5 +404,13 @@ void DisplayBuffer::write(Device *device, const string& filename)
        delete out;
 }
 
+device_memory& DisplayBuffer::rgba_data()
+{
+       if(half_float)
+               return rgba_half;
+       else
+               return rgba_byte;
+}
+
 CCL_NAMESPACE_END
 
index 2936a224d43ccde3aa3763159315a2c05ccec402..0b1f9010e755577ba2d73b52ebbe1ca92e66b0a2 100644 (file)
@@ -87,8 +87,8 @@ protected:
 
 /* Display Buffer
  *
- * The buffer used for drawing during render, filled by tonemapping the render
- * buffers and converting to uchar4 storage. */
+ * The buffer used for drawing during render, filled by converting the render
+ * buffers to byte of half float storage */
 
 class DisplayBuffer {
 public:
@@ -100,10 +100,13 @@ public:
        int draw_width, draw_height;
        /* draw alpha channel? */
        bool transparent;
-       /* byte buffer for tonemapped result */
-       device_vector<uchar4> rgba;
+       /* use half float? */
+       bool half_float;
+       /* byte buffer for converted result */
+       device_vector<uchar4> rgba_byte;
+       device_vector<half4> rgba_half;
 
-       DisplayBuffer(Device *device);
+       DisplayBuffer(Device *device, bool linear = false);
        ~DisplayBuffer();
 
        void reset(Device *device, BufferParams& params);
@@ -113,6 +116,8 @@ public:
        void draw(Device *device);
        bool draw_ready();
 
+       device_memory& rgba_data();
+
 protected:
        void device_free();
 
@@ -134,7 +139,6 @@ public:
 
        device_ptr buffer;
        device_ptr rng_state;
-       device_ptr rgba;
 
        RenderBuffers *buffers;
 
index 01f8a950c8f6f8d9786245f52c25b2a6054a939c..d18223d7ab37079145e990efed2bfed87a02e6e3 100644 (file)
@@ -56,7 +56,7 @@ Session::Session(const SessionParams& params_)
        }
        else {
                buffers = new RenderBuffers(device);
-               display = new DisplayBuffer(device);
+               display = new DisplayBuffer(device, params.display_buffer_linear);
        }
 
        session_thread = NULL;
@@ -371,7 +371,6 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile)
 
                rtile.buffer = buffers->buffer.device_pointer;
                rtile.rng_state = buffers->rng_state.device_pointer;
-               rtile.rgba = display->rgba.device_pointer;
                rtile.buffers = buffers;
 
                device->map_tile(tile_device, rtile);
@@ -415,7 +414,6 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile)
 
        rtile.buffer = tilebuffers->buffer.device_pointer;
        rtile.rng_state = tilebuffers->rng_state.device_pointer;
-       rtile.rgba = 0;
        rtile.buffers = tilebuffers;
 
        /* this will tag tile as IN PROGRESS in blender-side render pipeline,
@@ -838,13 +836,14 @@ void Session::path_trace()
 void Session::tonemap()
 {
        /* add tonemap task */
-       DeviceTask task(DeviceTask::TONEMAP);
+       DeviceTask task(DeviceTask::FILM_CONVERT);
 
        task.x = tile_manager.state.buffer.full_x;
        task.y = tile_manager.state.buffer.full_y;
        task.w = tile_manager.state.buffer.width;
        task.h = tile_manager.state.buffer.height;
-       task.rgba = display->rgba.device_pointer;
+       task.rgba_byte = display->rgba_byte.device_pointer;
+       task.rgba_half = display->rgba_half.device_pointer;
        task.buffer = buffers->buffer.device_pointer;
        task.sample = tile_manager.state.sample;
        tile_manager.state.buffer.get_offset_stride(task.offset, task.stride);
index 0874bfee780c4c328913e8a7a6db7921d644aefe..8cff64def4e8fd241953b91c22aeb557ffca4a86 100644 (file)
@@ -53,6 +53,8 @@ public:
        int start_resolution;
        int threads;
 
+       bool display_buffer_linear;
+
        double cancel_timeout;
        double reset_timeout;
        double text_timeout;
@@ -72,6 +74,8 @@ public:
                start_resolution = INT_MAX;
                threads = 0;
 
+               display_buffer_linear = false;
+
                cancel_timeout = 0.1;
                reset_timeout = 0.1;
                text_timeout = 1.0;
@@ -91,6 +95,7 @@ public:
                && tile_size == params.tile_size
                && start_resolution == params.start_resolution
                && threads == params.threads
+               && display_buffer_linear == params.display_buffer_linear
                && cancel_timeout == params.cancel_timeout
                && reset_timeout == params.reset_timeout
                && text_timeout == params.text_timeout
index 758f39a76b280bda45b7a67c7832b964bc8b4960..f48fd1e124b8031f0d96237bd01a3f67f7057b19 100644 (file)
@@ -541,6 +541,70 @@ template<size_t i0, size_t i1, size_t i2, size_t i3> __device_inline const __m12
 }
 #endif
 
+/* Half Floats */
+
+#ifdef __KERNEL_OPENCL__
+
+__device_inline void float4_store_half(half *h, const float4 *f, float scale)
+{
+       vstore_half4(*f * scale, 0, h);
+}
+
+#else
+
+typedef unsigned short half;
+struct half4 { half x, y, z, w; };
+
+#ifdef __KERNEL_CUDA__
+
+__device_inline void float4_store_half(half *h, const float4 *f, float scale)
+{
+       h[0] = __float2half_rn(f->x * scale);
+       h[1] = __float2half_rn(f->y * scale);
+       h[2] = __float2half_rn(f->z * scale);
+       h[3] = __float2half_rn(f->w * scale);
+}
+
+#else
+
+__device_inline void float4_store_half(half *h, const float4 *f, float scale)
+{
+#ifndef __KERNEL_SSE2__
+       for(int i = 0; i < 4; i++) {
+               /* optimized float to half for pixels:
+                * assumes no negative, no nan, no inf, and sets denormal to 0 */
+               union { uint i; float f; } in;
+               in.f = ((*f)[i] > 0.0f)? (*f)[i] * scale: 0.0f;
+               int x = in.i;
+
+               int absolute = x & 0x7FFFFFFF;
+               int Z = absolute + 0xC8000000;
+               int result = (absolute < 0x38800000)? 0: Z;
+
+               h[i] = ((result >> 13) & 0x7FFF);
+       }
+#else
+       /* same as above with SSE */
+       const __m128 mm_scale = _mm_set_ps1(scale);
+       const __m128i mm_38800000 = _mm_set1_epi32(0x38800000);
+       const __m128i mm_7FFF = _mm_set1_epi32(0x7FFF);
+       const __m128i mm_7FFFFFFF = _mm_set1_epi32(0x7FFFFFFF);
+       const __m128i mm_C8000000 = _mm_set1_epi32(0xC8000000);
+
+       __m128i x = _mm_castps_si128(_mm_max_ps(_mm_mul_ps(*(__m128*)f, mm_scale), _mm_set_ps1(0.0f)));
+       __m128i absolute = _mm_and_si128(x, mm_7FFFFFFF);
+       __m128i Z = _mm_add_epi32(absolute, mm_C8000000);
+       __m128i result = _mm_andnot_si128(_mm_cmplt_epi32(absolute, mm_38800000), Z); 
+       __m128i rh = _mm_and_si128(_mm_srai_epi32(result, 13), mm_7FFF);
+
+       _mm_storel_pi((__m64*)h, _mm_castsi128_ps(_mm_packs_epi32(rh, rh)));
+#endif
+}
+
+#endif
+
+#endif
+
 CCL_NAMESPACE_END
 
 #endif /* __UTIL_TYPES_H__ */