Cycles: Optionally output luminance from the shader evaluation kernel
authorSergey Sharybin <sergey.vfx@gmail.com>
Wed, 30 Dec 2015 14:04:01 +0000 (19:04 +0500)
committerSergey Sharybin <sergey.vfx@gmail.com>
Wed, 30 Dec 2015 14:04:04 +0000 (19:04 +0500)
This makes it possible to move some parts of evaluation from host to the device
and hopefully reduce memory usage by avoid having full RGBA buffer on the host.

Reviewers: juicyfruit, lukasstockner97, brecht

Reviewed By: lukasstockner97, brecht

Differential Revision: https://developer.blender.org/D1702

13 files changed:
intern/cycles/device/device_cpu.cpp
intern/cycles/device/device_cuda.cpp
intern/cycles/device/device_multi.cpp
intern/cycles/device/device_network.cpp
intern/cycles/device/device_network.h
intern/cycles/device/device_opencl.cpp
intern/cycles/device/device_task.cpp
intern/cycles/device/device_task.h
intern/cycles/kernel/kernel_bake.h
intern/cycles/kernel/kernels/cpu/kernel_cpu.h
intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
intern/cycles/kernel/kernels/cuda/kernel.cu
intern/cycles/kernel/kernels/opencl/kernel.cl

index f06963c146ea401eec38102bd60fe7f6bfbecc46..832f4d1c1fd49ebaa0e26e2206828849c3ba02ee 100644 (file)
@@ -343,7 +343,7 @@ public:
 #ifdef WITH_OSL
                OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
 #endif
-               void(*shader_kernel)(KernelGlobals*, uint4*, float4*, int, int, int, int);
+               void(*shader_kernel)(KernelGlobals*, uint4*, float4*, float*, int, int, int, int);
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
                if(system_cpu_support_avx2())
@@ -374,8 +374,14 @@ public:
 
                for(int sample = 0; sample < task.num_samples; sample++) {
                        for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
-                               shader_kernel(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
-                                       task.shader_eval_type, x, task.offset, sample);
+                               shader_kernel(&kg,
+                                             (uint4*)task.shader_input,
+                                             (float4*)task.shader_output,
+                                             (float*)task.shader_output_luma,
+                                             task.shader_eval_type,
+                                             x,
+                                             task.offset,
+                                             sample);
 
                        if(task.get_cancel() || task_pool.canceled())
                                break;
index d9d6fd77ecb315331b1f47503197ba6b43ff18b3..5c9ca3454c62ad0fec92bf5e00b27243d3de90c8 100644 (file)
@@ -726,6 +726,7 @@ public:
                CUfunction cuShader;
                CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
                CUdeviceptr d_output = cuda_device_ptr(task.shader_output);
+               CUdeviceptr d_output_luma = cuda_device_ptr(task.shader_output_luma);
 
                /* get kernel function */
                if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
@@ -747,13 +748,18 @@ public:
                                int shader_w = min(shader_chunk_size, end - shader_x);
 
                                /* pass in parameters */
-                               void *args[] = {&d_input,
-                                                                &d_output,
-                                                                &task.shader_eval_type,
-                                                                &shader_x,
-                                                                &shader_w,
-                                                                &offset,
-                                                                &sample};
+                               void *args[8];
+                               int arg = 0;
+                               args[arg++] = &d_input;
+                               args[arg++] = &d_output;
+                               if(task.shader_eval_type < SHADER_EVAL_BAKE) {
+                                       args[arg++] = &d_output_luma;
+                               }
+                               args[arg++] = &task.shader_eval_type;
+                               args[arg++] = &shader_x;
+                               args[arg++] = &shader_w;
+                               args[arg++] = &offset;
+                               args[arg++] = &sample;
 
                                /* launch kernel */
                                int threads_per_block;
index 8fb841b2b0d43ffdeb816b09e324b8ccf50176c2..069305e8a292cae9a0945744eb3894d8bcdb7189 100644 (file)
@@ -316,6 +316,7 @@ public:
                                if(task.rgba_half) subtask.rgba_half = sub.ptr_map[task.rgba_half];
                                if(task.shader_input) subtask.shader_input = sub.ptr_map[task.shader_input];
                                if(task.shader_output) subtask.shader_output = sub.ptr_map[task.shader_output];
+                               if(task.shader_output_luma) subtask.shader_output_luma = sub.ptr_map[task.shader_output_luma];
 
                                sub.device->task_add(subtask);
                        }
index afa35224aba5ac63fc48087144b2f72c614481ba..23faa61e8e52e6a89c1442460669a37784dd5be5 100644 (file)
@@ -648,6 +648,9 @@ protected:
                        if(task.shader_output)
                                task.shader_output = device_ptr_from_client_pointer(task.shader_output);
 
+                       if(task.shader_output)luma)
+                               task.shader_output_luma = device_ptr_from_client_pointer(task.shader_output_luma);
+
 
                        task.acquire_tile = function_bind(&DeviceServer::task_acquire_tile, this, _1, _2);
                        task.release_tile = function_bind(&DeviceServer::task_release_tile, this, _1);
index 2e751f6697f7b410957d537408641e82a4ad687e..60ecc1d0a86a771f0f14e984791dd29a34ef1cb4 100644 (file)
@@ -132,7 +132,7 @@ public:
                archive & type & task.x & task.y & task.w & task.h;
                archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples;
                archive & task.offset & task.stride;
-               archive & task.shader_input & task.shader_output & task.shader_eval_type;
+               archive & task.shader_input & task.shader_output & task.shader_output_luma & task.shader_eval_type;
                archive & task.shader_x & task.shader_w;
                archive & task.need_finish_queue;
        }
@@ -291,7 +291,7 @@ public:
                *archive & type & task.x & task.y & task.w & task.h;
                *archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples;
                *archive & task.offset & task.stride;
-               *archive & task.shader_input & task.shader_output & task.shader_eval_type;
+               *archive & task.shader_input & task.shader_output & task.shader_output_luma & task.shader_eval_type;
                *archive & task.shader_x & task.shader_w;
                *archive & task.need_finish_queue;
 
index e0c602461ed3bb30ed18b31d424ad3dab3ab7314..a1743f53831db143abaf9224165fd0f27f9f571e 100644 (file)
@@ -1304,6 +1304,7 @@ public:
                cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
                cl_mem d_input = CL_MEM_PTR(task.shader_input);
                cl_mem d_output = CL_MEM_PTR(task.shader_output);
+               cl_mem d_output_luma = CL_MEM_PTR(task.shader_output_luma);
                cl_int d_shader_eval_type = task.shader_eval_type;
                cl_int d_shader_x = task.shader_x;
                cl_int d_shader_w = task.shader_w;
@@ -1330,6 +1331,12 @@ public:
                                                d_input,
                                                d_output);
 
+               if(task.shader_eval_type < SHADER_EVAL_BAKE) {
+                       start_arg_index += kernel_set_args(kernel,
+                                                          start_arg_index,
+                                                          d_output_luma);
+               }
+
 #define KERNEL_TEX(type, ttype, name) \
                set_kernel_arg_mem(kernel, &start_arg_index, #name);
 #include "kernel_textures.h"
index d527540f300a3cad4c16810f0280593964fa740b..0cae118a6923b71e95c6e7695b750232a18b1deb 100644 (file)
@@ -29,7 +29,7 @@ CCL_NAMESPACE_BEGIN
 DeviceTask::DeviceTask(Type type_)
 : type(type_), x(0), y(0), w(0), h(0), rgba_byte(0), rgba_half(0), buffer(0),
   sample(0), num_samples(1),
-  shader_input(0), shader_output(0),
+  shader_input(0), shader_output(0), shader_output_luma(0),
   shader_eval_type(0), shader_x(0), shader_w(0)
 {
        last_update_time = time_dt();
index 834ea60988a8c798444a497ccacff23ebe93c77f..7654508d4a569acca493dab2261186ccf28c8f94 100644 (file)
@@ -46,7 +46,7 @@ public:
        int offset, stride;
 
        device_ptr shader_input;
-       device_ptr shader_output;
+       device_ptr shader_output, shader_output_luma;
        int shader_eval_type;
        int shader_x, shader_w;
 
index 715c11c7ea0f7475dc1071c917081a71e520e8fc..b54afbd21b8c7f67fe9e92c18d043d9c9ddb63b8 100644 (file)
@@ -453,7 +453,13 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
                output[i] += make_float4(out.x, out.y, out.z, 1.0f) * output_fac;
 }
 
-ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i, int sample)
+ccl_device void kernel_shader_evaluate(KernelGlobals *kg,
+                                       ccl_global uint4 *input,
+                                       ccl_global float4 *output,
+                                       ccl_global float *output_luma,
+                                       ShaderEvalType type,
+                                       int i,
+                                       int sample)
 {
        ShaderData sd;
        uint4 in = input[i];
@@ -500,10 +506,22 @@ ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *inpu
        }
        
        /* write output */
-       if(sample == 0)
-               output[i] = make_float4(out.x, out.y, out.z, 0.0f);
-       else
-               output[i] += make_float4(out.x, out.y, out.z, 0.0f);
+       if(sample == 0) {
+               if(output != NULL) {
+                       output[i] = make_float4(out.x, out.y, out.z, 0.0f);
+               }
+               if(output_luma != NULL) {
+                       output_luma[i] = average(out);
+               }
+       }
+       else {
+               if(output != NULL) {
+                       output[i] += make_float4(out.x, out.y, out.z, 0.0f);
+               }
+               if(output_luma != NULL) {
+                       output_luma[i] += average(out);
+               }
+       }
 }
 
 CCL_NAMESPACE_END
index 2560c6d8dee8e8da6cae7d5673399026ef8afa9d..1ce1e41272b4120abf6fac3e3081fe4146761645 100644 (file)
@@ -42,6 +42,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
 void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
                                        uint4 *input,
                                        float4 *output,
+                                       float *output_luma,
                                        int type,
                                        int i,
                                        int offset,
index 693285ec3a8365723465749f67be30baed061b16..0249610b381ef9dbd97af00b3910a3e41a27e5f8 100644 (file)
@@ -99,12 +99,14 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
 void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
                                        uint4 *input,
                                        float4 *output,
+                                       float *output_luma,
                                        int type,
                                        int i,
                                        int offset,
                                        int sample)
 {
        if(type >= SHADER_EVAL_BAKE) {
+               kernel_assert(output_luma == NULL);
                kernel_bake_evaluate(kg,
                                     input,
                                     output,
@@ -117,6 +119,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
                kernel_shader_evaluate(kg,
                                       input,
                                       output,
+                                      output_luma,
                                       (ShaderEvalType)type,
                                       i,
                                       sample);
index 3929b676f0708351b81894d85a6071bf56b41a6b..e094612de01ed7d9b722e370006d53bb1cad4b32 100644 (file)
@@ -159,12 +159,26 @@ kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scal
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx, int sw, int offset, int sample)
+kernel_cuda_shader(uint4 *input,
+                   float4 *output,
+                   float *output_luma,
+                   int type,
+                   int sx,
+                   int sw,
+                   int offset,
+                   int sample)
 {
        int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 
-       if(x < sx + sw)
-               kernel_shader_evaluate(NULL, input, output, (ShaderEvalType)type, x, sample);
+       if(x < sx + sw) {
+               kernel_shader_evaluate(NULL,
+                                      input,
+                                      output,
+                                      output_luma,
+                                      (ShaderEvalType)type, 
+                                      x,
+                                      sample);
+       }
 }
 
 extern "C" __global__ void
index 57db6fd9098a1225f6b6c17dbe279fbc89cd894a..4c9f7ba1d7c3a7d12f52b29b3b076968a4026390 100644 (file)
@@ -61,6 +61,7 @@ __kernel void kernel_ocl_shader(
        ccl_constant KernelData *data,
        ccl_global uint4 *input,
        ccl_global float4 *output,
+       ccl_global float *output_luma,
 
 #define KERNEL_TEX(type, ttype, name) \
        ccl_global type *name,
@@ -78,8 +79,15 @@ __kernel void kernel_ocl_shader(
 
        int x = sx + get_global_id(0);
 
-       if(x < sx + sw)
-               kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample);
+       if(x < sx + sw) {
+               kernel_shader_evaluate(kg,
+                                      input,
+                                      output,
+                                      output_luma,
+                                      (ShaderEvalType)type,
+                                      x,
+                                      sample);
+       }
 }
 
 __kernel void kernel_ocl_bake(