T61463: Separate Baking kernels
author Jeroen Bakker <j.bakker@atmind.nl>
Tue, 19 Feb 2019 15:31:31 +0000 (16:31 +0100)
committer Jeroen Bakker <j.bakker@atmind.nl>
Tue, 19 Feb 2019 15:34:55 +0000 (16:34 +0100)
Cycles OpenCL: Split baking kernels in own program

Fix T61463. Before this patch, baking was part of the base kernels. There
are 3 baking kernels and all 3 use shader evaluation. For only one of
these kernels was the functionality wrapped in the __NO_BAKING__
compile directive.

When you start baking, this leads to long compile times. Separating the
kernels into individual programs reduces the compile times.

Also wrapped all baking kernels with __NO_BAKING__ to reduce the
compilation times.

Impact on compilation time

    job   |   scene_name    | previous |  new  | percentage
  --------+-----------------+----------+-------+------------
   T61463 | empty           |    10.63 |  7.27 |         32%
   T61463 | bmw             |    17.91 | 14.24 |         20%
   T61463 | fishycat        |    19.57 | 15.08 |         23%
   T61463 | barbershop      |    54.10 | 48.18 |         11%
   T61463 | classroom       |    17.55 | 14.42 |         18%
   T61463 | koro            |    18.92 | 17.15 |          9%
   T61463 | pavillion       |    17.43 | 14.23 |         18%
   T61463 | splash279       |    16.48 | 15.33 |          7%
   T61463 | volume_emission |    36.22 | 34.19 |          6%

Impact on render time

    job   |   scene_name    | previous |   new   | percentage
  --------+-----------------+----------+---------+------------
   T61463 | empty           |    21.06 |   20.54 |          2%
   T61463 | bmw             |   198.44 |  189.59 |          4%
   T61463 | fishycat        |   394.20 |  388.50 |          1%
   T61463 | barbershop      |  1188.16 | 1185.49 |          0%
   T61463 | classroom       |   341.08 |  339.27 |          1%
   T61463 | koro            |   472.43 |  360.70 |         24%
   T61463 | pavillion       |   905.77 |  902.14 |          0%
   T61463 | splash279       |    55.26 |   54.92 |          1%
   T61463 | volume_emission |    62.59 |   39.09 |         38%

I don't have a grounded explanation why koro and volume_emission are this much
faster; I have done several tests though...

Maniphest Tasks: T61463

Differential Revision: https://developer.blender.org/D4376

intern/cycles/device/opencl/opencl.h
intern/cycles/device/opencl/opencl_base.cpp
intern/cycles/device/opencl/opencl_split.cpp
intern/cycles/kernel/CMakeLists.txt
intern/cycles/kernel/kernels/opencl/kernel.cl
intern/cycles/kernel/kernels/opencl/kernel_background.cl [new file with mode: 0644]
intern/cycles/kernel/kernels/opencl/kernel_bake.cl [new file with mode: 0644]
intern/cycles/kernel/kernels/opencl/kernel_displace.cl [new file with mode: 0644]

index a2c0e53..766b9e4 100644 (file)
@@ -325,7 +325,11 @@ public:
                map<ustring, cl_kernel> kernels;
        };
 
-       OpenCLProgram base_program, denoising_program;
+       OpenCLProgram base_program;
+       OpenCLProgram bake_program;
+       OpenCLProgram displace_program;
+       OpenCLProgram background_program;
+       OpenCLProgram denoising_program;
 
        typedef map<string, device_vector<uchar>*> ConstMemMap;
        typedef map<string, device_ptr> MemMap;
@@ -571,7 +575,7 @@ protected:
                ustring key,
                thread_scoped_lock& cache_locker);
 
-       virtual string build_options_for_base_program(
+       virtual string build_options_for_bake_program(
                const DeviceRequestedFeatures& /*requested_features*/);
 
 private:
index d8f9a24..6a47a60 100644 (file)
@@ -162,6 +162,9 @@ OpenCLDeviceBase::~OpenCLDeviceBase()
        }
 
        base_program.release();
+       bake_program.release();
+       displace_program.release();
+       background_program.release();
        if(cqCommandQueue)
                clReleaseCommandQueue(cqCommandQueue);
        if(cxContext)
@@ -225,14 +228,20 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
        if(!opencl_version_check())
                return false;
 
-       base_program = OpenCLProgram(this, "base", "kernel.cl", build_options_for_base_program(requested_features));
+       base_program = OpenCLProgram(this, "base", "kernel.cl", "");
        base_program.add_kernel(ustring("convert_to_byte"));
        base_program.add_kernel(ustring("convert_to_half_float"));
-       base_program.add_kernel(ustring("displace"));
-       base_program.add_kernel(ustring("background"));
-       base_program.add_kernel(ustring("bake"));
        base_program.add_kernel(ustring("zero_buffer"));
 
+       bake_program = OpenCLProgram(this, "bake", "kernel_bake.cl", build_options_for_bake_program(requested_features));
+       bake_program.add_kernel(ustring("bake"));
+
+       displace_program = OpenCLProgram(this, "displace", "kernel_displace.cl", build_options_for_bake_program(requested_features));
+       displace_program.add_kernel(ustring("displace"));
+
+       background_program = OpenCLProgram(this, "background", "kernel_background.cl", build_options_for_bake_program(requested_features));
+       background_program.add_kernel(ustring("background"));
+
        denoising_program = OpenCLProgram(this, "denoising", "filter.cl", "");
        denoising_program.add_kernel(ustring("filter_divide_shadow"));
        denoising_program.add_kernel(ustring("filter_get_feature"));
@@ -248,12 +257,15 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
        denoising_program.add_kernel(ustring("filter_finalize"));
 
        vector<OpenCLProgram*> programs;
-       programs.push_back(&base_program);
-       programs.push_back(&denoising_program);
+       programs.push_back(&bake_program);
+       programs.push_back(&displace_program);
+       programs.push_back(&background_program);
        /* Call actual class to fill the vector with its programs. */
        if(!add_kernel_programs(requested_features, programs)) {
                return false;
        }
+       programs.push_back(&base_program);
+       programs.push_back(&denoising_program);
 
        /* Parallel compilation of Cycles kernels, this launches multiple
         * processes to workaround OpenCL frameworks serializing the calls
@@ -1152,13 +1164,13 @@ void OpenCLDeviceBase::shader(DeviceTask& task)
        cl_kernel kernel;
 
        if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
-               kernel = base_program(ustring("bake"));
+               kernel = bake_program(ustring("bake"));
        }
        else if(task.shader_eval_type == SHADER_EVAL_DISPLACE) {
-               kernel = base_program(ustring("displace"));
+               kernel = displace_program(ustring("displace"));
        }
        else {
-               kernel = base_program(ustring("background"));
+               kernel = background_program(ustring("background"));
        }
 
        cl_uint start_arg_index =
@@ -1385,7 +1397,7 @@ void OpenCLDeviceBase::store_cached_kernel(
                                   cache_locker);
 }
 
-string OpenCLDeviceBase::build_options_for_base_program(
+string OpenCLDeviceBase::build_options_for_bake_program(
         const DeviceRequestedFeatures& requested_features)
 {
        /* TODO(sergey): By default we compile all features, meaning
index b759f69..c9d3eb2 100644 (file)
@@ -327,7 +327,7 @@ public:
 protected:
        /* ** Those guys are for workign around some compiler-specific bugs ** */
 
-       string build_options_for_base_program(
+       string build_options_for_bake_program(
                const DeviceRequestedFeatures& requested_features)
        {
                return requested_features.get_build_options();
index f7041ee..0a2acd3 100644 (file)
@@ -37,6 +37,9 @@ set(SRC_CUDA_KERNELS
 
 set(SRC_OPENCL_KERNELS
        kernels/opencl/kernel.cl
+       kernels/opencl/kernel_bake.cl
+       kernels/opencl/kernel_displace.cl
+       kernels/opencl/kernel_background.cl
        kernels/opencl/kernel_state_buffer_size.cl
        kernels/opencl/kernel_split.cl
        kernels/opencl/kernel_split_bundle.cl
index de1f508..aa837e2 100644 (file)
@@ -81,78 +81,6 @@ __kernel void kernel_ocl_path_trace(
 
 #else  /* __COMPILE_ONLY_MEGAKERNEL__ */
 
-__kernel void kernel_ocl_displace(
-       ccl_constant KernelData *data,
-       ccl_global uint4 *input,
-       ccl_global float4 *output,
-
-       KERNEL_BUFFER_PARAMS,
-
-       int type, int sx, int sw, int offset, int sample)
-{
-       KernelGlobals kglobals, *kg = &kglobals;
-
-       kg->data = data;
-
-       kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
-       kernel_set_buffer_info(kg);
-
-       int x = sx + ccl_global_id(0);
-
-       if(x < sx + sw) {
-               kernel_displace_evaluate(kg, input, output, x);
-       }
-}
-__kernel void kernel_ocl_background(
-       ccl_constant KernelData *data,
-       ccl_global uint4 *input,
-       ccl_global float4 *output,
-
-       KERNEL_BUFFER_PARAMS,
-
-       int type, int sx, int sw, int offset, int sample)
-{
-       KernelGlobals kglobals, *kg = &kglobals;
-
-       kg->data = data;
-
-       kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
-       kernel_set_buffer_info(kg);
-
-       int x = sx + ccl_global_id(0);
-
-       if(x < sx + sw) {
-               kernel_background_evaluate(kg, input, output, x);
-       }
-}
-
-__kernel void kernel_ocl_bake(
-       ccl_constant KernelData *data,
-       ccl_global uint4 *input,
-       ccl_global float4 *output,
-
-       KERNEL_BUFFER_PARAMS,
-
-       int type, int filter, int sx, int sw, int offset, int sample)
-{
-       KernelGlobals kglobals, *kg = &kglobals;
-
-       kg->data = data;
-
-       kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
-       kernel_set_buffer_info(kg);
-
-       int x = sx + ccl_global_id(0);
-
-       if(x < sx + sw) {
-#ifdef __NO_BAKING__
-               output[x] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-#else
-               kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, filter, x, offset, sample);
-#endif
-       }
-}
-
 __kernel void kernel_ocl_convert_to_byte(
        ccl_constant KernelData *data,
        ccl_global uchar4 *rgba,
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_background.cl b/intern/cycles/kernel/kernels/opencl/kernel_background.cl
new file mode 100644 (file)
index 0000000..c7c709c
--- /dev/null
@@ -0,0 +1,39 @@
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/kernel_math.h"
+#include "kernel/kernel_types.h"
+#include "kernel/kernel_globals.h"
+#include "kernel/kernel_color.h"
+#include "kernel/kernels/opencl/kernel_opencl_image.h"
+
+#include "kernel/kernel_path.h"
+#include "kernel/kernel_path_branched.h"
+
+#include "kernel/kernel_bake.h"
+/* Calls kernel_background_evaluate() for x = sx + ccl_global_id(0) when x < sx + sw; type, offset and sample are not read here. */
+__kernel void kernel_ocl_background(
+       ccl_constant KernelData *data,
+       ccl_global uint4 *input,
+       ccl_global float4 *output,
+
+       KERNEL_BUFFER_PARAMS,
+
+       int type, int sx, int sw, int offset, int sample)
+{
+       KernelGlobals kglobals, *kg = &kglobals;
+
+       kg->data = data;
+
+       kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+       kernel_set_buffer_info(kg);
+
+       int x = sx + ccl_global_id(0);
+
+       if(x < sx + sw) {  /* Do nothing for indices at or beyond sx + sw. */
+#ifdef __NO_BAKING__
+               output[x] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);  /* Evaluation is compiled out: write zeros instead. */
+#else
+               kernel_background_evaluate(kg, input, output, x);
+#endif
+       }
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_bake.cl b/intern/cycles/kernel/kernels/opencl/kernel_bake.cl
new file mode 100644 (file)
index 0000000..041312b
--- /dev/null
@@ -0,0 +1,38 @@
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/kernel_math.h"
+#include "kernel/kernel_types.h"
+#include "kernel/kernel_globals.h"
+#include "kernel/kernel_color.h"
+#include "kernel/kernels/opencl/kernel_opencl_image.h"
+
+#include "kernel/kernel_path.h"
+#include "kernel/kernel_path_branched.h"
+
+#include "kernel/kernel_bake.h"
+/* Calls kernel_bake_evaluate() for x = sx + ccl_global_id(0) when x < sx + sw, forwarding type (cast to ShaderEvalType), filter, offset and sample. */
+__kernel void kernel_ocl_bake(
+       ccl_constant KernelData *data,
+       ccl_global uint4 *input,
+       ccl_global float4 *output,
+
+       KERNEL_BUFFER_PARAMS,
+
+       int type, int filter, int sx, int sw, int offset, int sample)
+{
+       KernelGlobals kglobals, *kg = &kglobals;
+
+       kg->data = data;
+
+       kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+       kernel_set_buffer_info(kg);
+
+       int x = sx + ccl_global_id(0);
+
+       if(x < sx + sw) {  /* Do nothing for indices at or beyond sx + sw. */
+#ifdef __NO_BAKING__
+               output[x] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);  /* Evaluation is compiled out: write zeros instead. */
+#else
+               kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, filter, x, offset, sample);
+#endif
+       }
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_displace.cl b/intern/cycles/kernel/kernels/opencl/kernel_displace.cl
new file mode 100644 (file)
index 0000000..288bfd5
--- /dev/null
@@ -0,0 +1,40 @@
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/kernel_math.h"
+#include "kernel/kernel_types.h"
+#include "kernel/kernel_globals.h"
+#include "kernel/kernel_color.h"
+#include "kernel/kernels/opencl/kernel_opencl_image.h"
+
+#include "kernel/kernel_path.h"
+#include "kernel/kernel_path_branched.h"
+
+#include "kernel/kernel_bake.h"
+/* Calls kernel_displace_evaluate() for x = sx + ccl_global_id(0) when x < sx + sw; type, offset and sample are not read here. */
+__kernel void kernel_ocl_displace(
+       ccl_constant KernelData *data,
+       ccl_global uint4 *input,
+       ccl_global float4 *output,
+
+       KERNEL_BUFFER_PARAMS,
+
+       int type, int sx, int sw, int offset, int sample)
+{
+       KernelGlobals kglobals, *kg = &kglobals;
+
+       kg->data = data;
+
+       kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+       kernel_set_buffer_info(kg);
+
+       int x = sx + ccl_global_id(0);
+
+       if(x < sx + sw) {  /* Do nothing for indices at or beyond sx + sw. */
+#ifdef __NO_BAKING__
+               output[x] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);  /* Evaluation is compiled out: write zeros instead. */
+#else
+               kernel_displace_evaluate(kg, input, output, x);
+#endif
+       }
+}
+