Cycles: prefilter feature passes separate from denoising.
authorLukas Stockner <lukas.stockner@freenet.de>
Wed, 6 Feb 2019 11:42:10 +0000 (12:42 +0100)
committerBrecht Van Lommel <brechtvanlommel@gmail.com>
Wed, 6 Feb 2019 14:18:29 +0000 (15:18 +0100)
Prefiltering of feature passes will happen during rendering, which can
then be used for denoising immediately or written as a render pass for
later (animation) denoising.

The number of denoising data passes written is reduced because of this,
leaving out the feature variance passes. The passes are now Normal,
Albedo, Depth, Shadowing, Variance and Intensity.

Ref D3889.

26 files changed:
intern/cycles/blender/addon/engine.py
intern/cycles/blender/blender_session.cpp
intern/cycles/blender/blender_sync.cpp
intern/cycles/device/device_cpu.cpp
intern/cycles/device/device_cuda.cpp
intern/cycles/device/device_denoising.cpp
intern/cycles/device/device_denoising.h
intern/cycles/device/device_task.h
intern/cycles/device/opencl/opencl.h
intern/cycles/device/opencl/opencl_base.cpp
intern/cycles/kernel/filter/filter_defines.h
intern/cycles/kernel/filter/filter_nlm_cpu.h
intern/cycles/kernel/filter/filter_nlm_gpu.h
intern/cycles/kernel/filter/filter_prefilter.h
intern/cycles/kernel/filter/filter_reconstruction.h
intern/cycles/kernel/kernel_types.h
intern/cycles/kernel/kernels/cpu/filter_cpu.h
intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
intern/cycles/kernel/kernels/cuda/filter.cu
intern/cycles/kernel/kernels/opencl/filter.cl
intern/cycles/render/buffers.cpp
intern/cycles/render/buffers.h
intern/cycles/render/film.cpp
intern/cycles/render/film.h
intern/cycles/render/session.cpp
intern/cycles/render/session.h

index 23239ee43522462d2c6df8fed4268b8a99a21f34..83b9a8eee0c8d79c8288462bd87632fd33c20abb 100644 (file)
@@ -269,14 +269,11 @@ def register_passes(engine, scene, srl):
         engine.register_pass(scene, srl, "Noisy Image", 4, "RGBA", 'COLOR')
         if crl.denoising_store_passes:
             engine.register_pass(scene, srl, "Denoising Normal",          3, "XYZ", 'VECTOR')
-            engine.register_pass(scene, srl, "Denoising Normal Variance", 3, "XYZ", 'VECTOR')
             engine.register_pass(scene, srl, "Denoising Albedo",          3, "RGB", 'COLOR')
-            engine.register_pass(scene, srl, "Denoising Albedo Variance", 3, "RGB", 'COLOR')
             engine.register_pass(scene, srl, "Denoising Depth",           1, "Z",   'VALUE')
-            engine.register_pass(scene, srl, "Denoising Depth Variance",  1, "Z",   'VALUE')
-            engine.register_pass(scene, srl, "Denoising Shadow A",        3, "XYV", 'VECTOR')
-            engine.register_pass(scene, srl, "Denoising Shadow B",        3, "XYV", 'VECTOR')
-            engine.register_pass(scene, srl, "Denoising Image Variance",  3, "RGB", 'COLOR')
+            engine.register_pass(scene, srl, "Denoising Shadowing",       1, "X",   'VALUE')
+            engine.register_pass(scene, srl, "Denoising Variance",        3, "RGB", 'COLOR')
+            engine.register_pass(scene, srl, "Denoising Intensity",       1, "X",   'VALUE')
             clean_options = ("denoising_diffuse_direct", "denoising_diffuse_indirect",
                              "denoising_glossy_direct", "denoising_glossy_indirect",
                              "denoising_transmission_direct", "denoising_transmission_indirect",
index dfa92dd1bc70da53d31678d7a3605851c0cc54f1..50ac35069a9775c5380216861716bb64723a95d7 100644 (file)
@@ -418,15 +418,19 @@ void BlenderSession::render()
                buffer_params.passes = passes;
 
                PointerRNA crl = RNA_pointer_get(&b_layer_iter->ptr, "cycles");
-               bool use_denoising = get_boolean(crl, "use_denoising");
-               bool denoising_passes = use_denoising || get_boolean(crl, "denoising_store_passes");
+               bool full_denoising = get_boolean(crl, "use_denoising");
+               bool write_denoising_passes = get_boolean(crl, "denoising_store_passes");
 
-               session->tile_manager.schedule_denoising = use_denoising;
-               buffer_params.denoising_data_pass = denoising_passes;
+               bool run_denoising = full_denoising || write_denoising_passes;
+
+               session->tile_manager.schedule_denoising = run_denoising;
+               buffer_params.denoising_data_pass = run_denoising;
                buffer_params.denoising_clean_pass = (scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES);
+               buffer_params.denoising_prefiltered_pass = write_denoising_passes;
 
-               session->params.use_denoising = use_denoising;
-               session->params.denoising_passes = denoising_passes;
+               session->params.run_denoising = run_denoising;
+               session->params.full_denoising = full_denoising;
+               session->params.write_denoising_passes = write_denoising_passes;
                session->params.denoising_radius = get_int(crl, "denoising_radius");
                session->params.denoising_strength = get_float(crl, "denoising_strength");
                session->params.denoising_feature_strength = get_float(crl, "denoising_feature_strength");
@@ -434,6 +438,7 @@ void BlenderSession::render()
 
                scene->film->denoising_data_pass = buffer_params.denoising_data_pass;
                scene->film->denoising_clean_pass = buffer_params.denoising_clean_pass;
+               scene->film->denoising_prefiltered_pass = buffer_params.denoising_prefiltered_pass;
                scene->film->pass_alpha_threshold = b_layer_iter->pass_alpha_threshold();
                scene->film->tag_passes_update(scene, passes);
                scene->film->tag_update(scene);
index 703fcc2078bdf9a21d4f4dce476a9319b5cfdbab..a6050b660403bcf0d62abe4d5dfe84c63913610c 100644 (file)
@@ -531,7 +531,7 @@ int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass)
 {
        string name = b_pass.name();
 
-       if(name == "Noisy Image") return DENOISING_PASS_COLOR;
+       if(name == "Noisy Image") return DENOISING_PASS_PREFILTERED_COLOR;
 
        if(name.substr(0, 10) != "Denoising ") {
                return -1;
@@ -539,15 +539,12 @@ int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass)
        name = name.substr(10);
 
 #define MAP_PASS(passname, offset) if(name == passname) return offset;
-       MAP_PASS("Normal", DENOISING_PASS_NORMAL);
-       MAP_PASS("Normal Variance", DENOISING_PASS_NORMAL_VAR);
-       MAP_PASS("Albedo", DENOISING_PASS_ALBEDO);
-       MAP_PASS("Albedo Variance", DENOISING_PASS_ALBEDO_VAR);
-       MAP_PASS("Depth", DENOISING_PASS_DEPTH);
-       MAP_PASS("Depth Variance", DENOISING_PASS_DEPTH_VAR);
-       MAP_PASS("Shadow A", DENOISING_PASS_SHADOW_A);
-       MAP_PASS("Shadow B", DENOISING_PASS_SHADOW_B);
-       MAP_PASS("Image Variance", DENOISING_PASS_COLOR_VAR);
+       MAP_PASS("Normal", DENOISING_PASS_PREFILTERED_NORMAL);
+       MAP_PASS("Albedo", DENOISING_PASS_PREFILTERED_ALBEDO);
+       MAP_PASS("Depth", DENOISING_PASS_PREFILTERED_DEPTH);
+       MAP_PASS("Shadowing", DENOISING_PASS_PREFILTERED_SHADOWING);
+       MAP_PASS("Variance", DENOISING_PASS_PREFILTERED_VARIANCE);
+       MAP_PASS("Intensity", DENOISING_PASS_PREFILTERED_INTENSITY);
        MAP_PASS("Clean", DENOISING_PASS_CLEAN);
 #undef MAP_PASS
 
@@ -579,10 +576,11 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
        }
 
        PointerRNA crp = RNA_pointer_get(&b_srlay.ptr, "cycles");
-       bool use_denoising = get_boolean(crp, "use_denoising");
-       bool store_denoising_passes = get_boolean(crp, "denoising_store_passes");
+       bool full_denoising = get_boolean(crp, "use_denoising");
+       bool write_denoising_passes = get_boolean(crp, "denoising_store_passes");
+
        scene->film->denoising_flags = 0;
-       if(use_denoising || store_denoising_passes) {
+       if(full_denoising || write_denoising_passes) {
 #define MAP_OPTION(name, flag) if(!get_boolean(crp, name)) scene->film->denoising_flags |= flag;
                MAP_OPTION("denoising_diffuse_direct",        DENOISING_CLEAN_DIFFUSE_DIR);
                MAP_OPTION("denoising_diffuse_indirect",      DENOISING_CLEAN_DIFFUSE_IND);
@@ -596,16 +594,13 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
                b_engine.add_pass("Noisy Image", 4, "RGBA", b_srlay.name().c_str());
        }
 
-       if(store_denoising_passes) {
+       if(write_denoising_passes) {
                b_engine.add_pass("Denoising Normal",          3, "XYZ", b_srlay.name().c_str());
-               b_engine.add_pass("Denoising Normal Variance", 3, "XYZ", b_srlay.name().c_str());
                b_engine.add_pass("Denoising Albedo",          3, "RGB", b_srlay.name().c_str());
-               b_engine.add_pass("Denoising Albedo Variance", 3, "RGB", b_srlay.name().c_str());
                b_engine.add_pass("Denoising Depth",           1, "Z",   b_srlay.name().c_str());
-               b_engine.add_pass("Denoising Depth Variance",  1, "Z",   b_srlay.name().c_str());
-               b_engine.add_pass("Denoising Shadow A",        3, "XYV", b_srlay.name().c_str());
-               b_engine.add_pass("Denoising Shadow B",        3, "XYV", b_srlay.name().c_str());
-               b_engine.add_pass("Denoising Image Variance",  3, "RGB", b_srlay.name().c_str());
+               b_engine.add_pass("Denoising Shadowing",       1, "X",   b_srlay.name().c_str());
+               b_engine.add_pass("Denoising Variance",        3, "RGB", b_srlay.name().c_str());
+               b_engine.add_pass("Denoising Intensity",       1, "X",   b_srlay.name().c_str());
 
                if(scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES) {
                        b_engine.add_pass("Denoising Clean",   3, "RGB", b_srlay.name().c_str());
index 16908b0244ad69d8f28b52f48cb70762f812528a..6668acc9cbe3b530ff384a16e8aaee896e16b612 100644 (file)
@@ -180,16 +180,17 @@ public:
        KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel;
        KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)>   shader_kernel;
 
-       KernelFunctions<void(*)(int, TileInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_divide_shadow_kernel;
-       KernelFunctions<void(*)(int, TileInfo*, int, int, int, int, float*, float*, int*, int, int)>               filter_get_feature_kernel;
+       KernelFunctions<void(*)(int, TileInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int)>  filter_divide_shadow_kernel;
+       KernelFunctions<void(*)(int, TileInfo*, int, int, int, int, float*, float*, float, int*, int, int)>         filter_get_feature_kernel;
+       KernelFunctions<void(*)(int, int, int, int*, float*, float*, int, int*)>                                    filter_write_feature_kernel;
        KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                               filter_detect_outliers_kernel;
        KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                               filter_combine_halves_kernel;
 
-       KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)>   filter_nlm_calc_difference_kernel;
-       KernelFunctions<void(*)(float*, float*, int*, int, int)>                                   filter_nlm_blur_kernel;
-       KernelFunctions<void(*)(float*, float*, int*, int, int)>                                   filter_nlm_calc_weight_kernel;
-       KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel;
-       KernelFunctions<void(*)(float*, float*, int*, int)>                                        filter_nlm_normalize_kernel;
+       KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int, float, float)>      filter_nlm_calc_difference_kernel;
+       KernelFunctions<void(*)(float*, float*, int*, int, int)>                                              filter_nlm_blur_kernel;
+       KernelFunctions<void(*)(float*, float*, int*, int, int)>                                              filter_nlm_calc_weight_kernel;
+       KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, int, int, int)>       filter_nlm_update_output_kernel;
+       KernelFunctions<void(*)(float*, float*, int*, int)>                                                   filter_nlm_normalize_kernel;
 
        KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)>                         filter_construct_transform_kernel;
        KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel;
@@ -218,6 +219,7 @@ public:
          REGISTER_KERNEL(shader),
          REGISTER_KERNEL(filter_divide_shadow),
          REGISTER_KERNEL(filter_get_feature),
+         REGISTER_KERNEL(filter_write_feature),
          REGISTER_KERNEL(filter_detect_outliers),
          REGISTER_KERNEL(filter_combine_halves),
          REGISTER_KERNEL(filter_nlm_calc_difference),
@@ -487,6 +489,8 @@ public:
 
                int w = align_up(rect.z-rect.x, 4);
                int h = rect.w-rect.y;
+               int stride = task->buffer.stride;
+               int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0;
 
                float *temporary_mem = (float*) task->buffer.temporary_mem.device_pointer;
                float *blurDifference = temporary_mem;
@@ -504,9 +508,10 @@ public:
                        filter_nlm_calc_difference_kernel()(dx, dy,
                                                            (float*) guide_ptr,
                                                            (float*) variance_ptr,
+                                                           NULL,
                                                            difference,
                                                            local_rect,
-                                                           w, 0,
+                                                           w, channel_offset,
                                                            a, k_2);
 
                        filter_nlm_blur_kernel()       (difference, blurDifference, local_rect, w, f);
@@ -520,7 +525,8 @@ public:
                                                          (float*) out_ptr,
                                                          weightAccum,
                                                          local_rect,
-                                                         w, f);
+                                                         channel_offset,
+                                                         stride, f);
                }
 
                int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y};
@@ -550,16 +556,13 @@ public:
                return true;
        }
 
-       bool denoising_reconstruct(device_ptr color_ptr,
-                                  device_ptr color_variance_ptr,
-                                  device_ptr output_ptr,
-                                  DenoisingTask *task)
+       bool denoising_accumulate(device_ptr color_ptr,
+                                 device_ptr color_variance_ptr,
+                                 device_ptr scale_ptr,
+                                 DenoisingTask *task)
        {
                ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT);
 
-               mem_zero(task->storage.XtWX);
-               mem_zero(task->storage.XtWY);
-
                float *temporary_mem = (float*) task->buffer.temporary_mem.device_pointer;
                float *difference     = temporary_mem;
                float *blurDifference = temporary_mem + task->buffer.pass_stride;
@@ -575,6 +578,7 @@ public:
                        filter_nlm_calc_difference_kernel()(dx, dy,
                                                            (float*) color_ptr,
                                                            (float*) color_variance_ptr,
+                                                           (float*) scale_ptr,
                                                            difference,
                                                            local_rect,
                                                            task->buffer.stride,
@@ -597,6 +601,13 @@ public:
                                                              4,
                                                              task->buffer.pass_stride);
                }
+
+               return true;
+       }
+
+       bool denoising_solve(device_ptr output_ptr,
+                            DenoisingTask *task)
+       {
                for(int y = 0; y < task->filter_area.w; y++) {
                        for(int x = 0; x < task->filter_area.z; x++) {
                                filter_finalize_kernel()(x,
@@ -661,6 +672,7 @@ public:
                                   int variance_offset,
                                   device_ptr mean_ptr,
                                   device_ptr variance_ptr,
+                                  float scale,
                                   DenoisingTask *task)
        {
                ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_GET_FEATURE);
@@ -674,6 +686,7 @@ public:
                                                            x, y,
                                                            (float*) mean_ptr,
                                                            (float*) variance_ptr,
+                                                           scale,
                                                            &task->rect.x,
                                                            task->render_buffer.pass_stride,
                                                            task->render_buffer.offset);
@@ -682,6 +695,26 @@ public:
                return true;
        }
 
+       bool denoising_write_feature(int out_offset,
+                                    device_ptr from_ptr,
+                                    device_ptr buffer_ptr,
+                                    DenoisingTask *task)
+       {
+               for(int y = 0; y < task->filter_area.w; y++) {
+                       for(int x = 0; x < task->filter_area.z; x++) {
+                               filter_write_feature_kernel()(task->render_buffer.samples,
+                                                             x + task->filter_area.x,
+                                                             y + task->filter_area.y,
+                                                             &task->reconstruction_state.buffer_params.x,
+                                                             (float*) from_ptr,
+                                                             (float*) buffer_ptr,
+                                                             out_offset,
+                                                             &task->rect.x);
+                       }
+               }
+               return true;
+       }
+
        bool denoising_detect_outliers(device_ptr image_ptr,
                                       device_ptr variance_ptr,
                                       device_ptr depth_ptr,
@@ -754,11 +787,13 @@ public:
                tile.sample = tile.start_sample + tile.num_samples;
 
                denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising);
-               denoising.functions.reconstruct = function_bind(&CPUDevice::denoising_reconstruct, this, _1, _2, _3, &denoising);
+               denoising.functions.accumulate = function_bind(&CPUDevice::denoising_accumulate, this, _1, _2, _3, &denoising);
+               denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising);
                denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
                denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
                denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
-               denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+               denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
+               denoising.functions.write_feature = function_bind(&CPUDevice::denoising_write_feature, this, _1, _2, _3, &denoising);
                denoising.functions.detect_outliers = function_bind(&CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
 
                denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
index 7b3c25a86d553ea522ca5d9d50cd7f4aac441ca3..cb7d8bbb2248d5de08abe0b65a0745e7a5ec81c1 100644 (file)
@@ -1300,7 +1300,7 @@ public:
 
                int pass_stride = task->buffer.pass_stride;
                int num_shifts = (2*r+1)*(2*r+1);
-               int channel_offset = 0;
+               int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0;
 
                if(have_error())
                        return false;
@@ -1308,6 +1308,7 @@ public:
                CUdeviceptr difference     = cuda_device_ptr(task->buffer.temporary_mem.device_pointer);
                CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts;
                CUdeviceptr weightAccum = difference + 2*sizeof(float)*pass_stride*num_shifts;
+               CUdeviceptr scale_ptr = 0;
 
                cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*pass_stride));
                cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*pass_stride));
@@ -1326,10 +1327,10 @@ public:
 
                        CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w*h, num_shifts);
 
-                       void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &a, &k_2};
+                       void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &scale_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &a, &k_2};
                        void *blur_args[]            = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
                        void *calc_weight_args[]     = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
-                       void *update_output_args[]   = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &pass_stride, &r, &f};
+                       void *update_output_args[]   = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &pass_stride, &channel_offset, &r, &f};
 
                        CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
                        CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
@@ -1379,19 +1380,16 @@ public:
                return !have_error();
        }
 
-       bool denoising_reconstruct(device_ptr color_ptr,
-                                  device_ptr color_variance_ptr,
-                                  device_ptr output_ptr,
-                                  DenoisingTask *task)
+       bool denoising_accumulate(device_ptr color_ptr,
+                                 device_ptr color_variance_ptr,
+                                 device_ptr scale_ptr,
+                                 DenoisingTask *task)
        {
                if(have_error())
                        return false;
 
                CUDAContextScope scope(this);
 
-               mem_zero(task->storage.XtWX);
-               mem_zero(task->storage.XtWY);
-
                int r = task->radius;
                int f = 4;
                float a = 1.0f;
@@ -1410,60 +1408,69 @@ public:
                CUdeviceptr difference     = cuda_device_ptr(task->buffer.temporary_mem.device_pointer);
                CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts;
 
-               {
-                       CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
-                       cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference,   cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
-                       cuda_assert(cuModuleGetFunction(&cuNLMBlur,             cuFilterModule, "kernel_cuda_filter_nlm_blur"));
-                       cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight,       cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
-                       cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
-
-                       cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference,   CU_FUNC_CACHE_PREFER_L1));
-                       cuda_assert(cuFuncSetCacheConfig(cuNLMBlur,             CU_FUNC_CACHE_PREFER_L1));
-                       cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight,       CU_FUNC_CACHE_PREFER_L1));
-                       cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
-
-                       CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
-                                            task->reconstruction_state.source_w * task->reconstruction_state.source_h,
-                                            num_shifts);
-
-                       void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &pass_stride, &a, &k_2};
-                       void *blur_args[]            = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
-                       void *calc_weight_args[]     = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
-                       void *construct_gramian_args[] = {&blurDifference,
-                                                         &task->buffer.mem.device_pointer,
-                                                         &task->storage.transform.device_pointer,
-                                                         &task->storage.rank.device_pointer,
-                                                         &task->storage.XtWX.device_pointer,
-                                                         &task->storage.XtWY.device_pointer,
-                                                         &task->reconstruction_state.filter_window,
-                                                         &w, &h, &stride,
-                                                         &pass_stride, &r,
-                                                         &f};
-
-                       CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
-                       CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
-                       CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
-                       CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
-                       CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
-               }
+               CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
+               cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference,   cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+               cuda_assert(cuModuleGetFunction(&cuNLMBlur,             cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+               cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight,       cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+               cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
+
+               cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference,   CU_FUNC_CACHE_PREFER_L1));
+               cuda_assert(cuFuncSetCacheConfig(cuNLMBlur,             CU_FUNC_CACHE_PREFER_L1));
+               cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight,       CU_FUNC_CACHE_PREFER_L1));
+               cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
+
+               CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
+                                    task->reconstruction_state.source_w * task->reconstruction_state.source_h,
+                                    num_shifts);
+
+               void *calc_difference_args[] = {&color_ptr,
+                                               &color_variance_ptr,
+                                               &scale_ptr,
+                                               &difference,
+                                               &w, &h,
+                                               &stride, &pass_stride,
+                                               &r, &pass_stride,
+                                               &a, &k_2};
+               void *blur_args[]            = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
+               void *calc_weight_args[]     = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
+               void *construct_gramian_args[] = {&blurDifference,
+                                                 &task->buffer.mem.device_pointer,
+                                                 &task->storage.transform.device_pointer,
+                                                 &task->storage.rank.device_pointer,
+                                                 &task->storage.XtWX.device_pointer,
+                                                 &task->storage.XtWY.device_pointer,
+                                                 &task->reconstruction_state.filter_window,
+                                                 &w, &h, &stride,
+                                                 &pass_stride, &r,
+                                                 &f};
+
+               CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
+               CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+               CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
+               CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+               CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
+               cuda_assert(cuCtxSynchronize());
 
-               {
-                       CUfunction cuFinalize;
-                       cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
-                       cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
-                       void *finalize_args[] = {&output_ptr,
-                                                        &task->storage.rank.device_pointer,
-                                                        &task->storage.XtWX.device_pointer,
-                                                        &task->storage.XtWY.device_pointer,
-                                                        &task->filter_area,
-                                                        &task->reconstruction_state.buffer_params.x,
-                                                        &task->render_buffer.samples};
-                       CUDA_GET_BLOCKSIZE(cuFinalize,
-                                          task->reconstruction_state.source_w,
-                                          task->reconstruction_state.source_h);
-                       CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
-               }
+               return !have_error();
+       }
 
+       bool denoising_solve(device_ptr output_ptr,
+                            DenoisingTask *task)
+       {
+               CUfunction cuFinalize;
+               cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
+               cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
+               void *finalize_args[] = {&output_ptr,
+                                        &task->storage.rank.device_pointer,
+                                        &task->storage.XtWX.device_pointer,
+                                        &task->storage.XtWY.device_pointer,
+                                        &task->filter_area,
+                                        &task->reconstruction_state.buffer_params.x,
+                                        &task->render_buffer.samples};
+               CUDA_GET_BLOCKSIZE(cuFinalize,
+                                  task->reconstruction_state.source_w,
+                                  task->reconstruction_state.source_h);
+               CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
                cuda_assert(cuCtxSynchronize());
 
                return !have_error();
@@ -1533,6 +1540,7 @@ public:
                                   int variance_offset,
                                   device_ptr mean_ptr,
                                   device_ptr variance_ptr,
+                                  float scale,
                                   DenoisingTask *task)
        {
                if(have_error())
@@ -1553,6 +1561,7 @@ public:
                                &variance_offset,
                                &mean_ptr,
                                &variance_ptr,
+                               &scale,
                                &task->rect,
                                &task->render_buffer.pass_stride,
                                &task->render_buffer.offset};
@@ -1562,6 +1571,36 @@ public:
                return !have_error();
        }
 
+       bool denoising_write_feature(int out_offset,
+                                    device_ptr from_ptr,
+                                    device_ptr buffer_ptr,
+                                    DenoisingTask *task)
+       {
+               if(have_error())
+                       return false;
+
+               CUDAContextScope scope(this);
+
+               CUfunction cuFilterWriteFeature;
+               cuda_assert(cuModuleGetFunction(&cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature"));
+               cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1));
+               CUDA_GET_BLOCKSIZE(cuFilterWriteFeature,
+                                  task->filter_area.z,
+                                  task->filter_area.w);
+
+               void *args[] = {&task->render_buffer.samples,
+                               &task->reconstruction_state.buffer_params,
+                               &task->filter_area,
+                               &from_ptr,
+                               &buffer_ptr,
+                               &out_offset,
+                               &task->rect};
+               CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args);
+               cuda_assert(cuCtxSynchronize());
+
+               return !have_error();
+       }
+
        bool denoising_detect_outliers(device_ptr image_ptr,
                                       device_ptr variance_ptr,
                                       device_ptr depth_ptr,
@@ -1596,11 +1635,13 @@ public:
        void denoise(RenderTile &rtile, DenoisingTask& denoising)
        {
                denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising);
-               denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, &denoising);
+               denoising.functions.accumulate = function_bind(&CUDADevice::denoising_accumulate, this, _1, _2, _3, &denoising);
+               denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising);
                denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
                denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
                denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
-               denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+               denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
+               denoising.functions.write_feature = function_bind(&CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising);
                denoising.functions.detect_outliers = function_bind(&CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
 
                denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
index 433cbd3c265a3a72e156e70d1f7c36a9f41173e8..724171c3acba82bbd23319e187d9d0c8f26f4621 100644 (file)
@@ -39,11 +39,18 @@ DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task)
        render_buffer.pass_stride = task.pass_stride;
        render_buffer.offset = task.pass_denoising_data;
 
-       target_buffer.pass_stride = task.pass_stride;
+       target_buffer.pass_stride = task.target_pass_stride;
        target_buffer.denoising_clean_offset = task.pass_denoising_clean;
+       target_buffer.offset = 0;
 
        functions.map_neighbor_tiles = function_bind(task.map_neighbor_tiles, _1, device);
        functions.unmap_neighbor_tiles = function_bind(task.unmap_neighbor_tiles, _1, device);
+
+       tile_info = (TileInfo*) tile_info_mem.alloc(sizeof(TileInfo)/sizeof(int));
+       tile_info->from_render = task.denoising_from_render? 1 : 0;
+
+       write_passes = task.denoising_write_passes;
+       do_filter = task.denoising_do_filter;
 }
 
 DenoisingTask::~DenoisingTask()
@@ -59,8 +66,6 @@ DenoisingTask::~DenoisingTask()
 
 void DenoisingTask::set_render_buffer(RenderTile *rtiles)
 {
-       tile_info = (TileInfo*) tile_info_mem.alloc(sizeof(TileInfo)/sizeof(int));
-
        for(int i = 0; i < 9; i++) {
                tile_info->offsets[i] = rtiles[i].offset;
                tile_info->strides[i] = rtiles[i].stride;
@@ -79,6 +84,13 @@ void DenoisingTask::set_render_buffer(RenderTile *rtiles)
        target_buffer.stride = rtiles[9].stride;
        target_buffer.ptr    = rtiles[9].buffer;
 
+       if(write_passes && rtiles[9].buffers) {
+               target_buffer.denoising_output_offset = rtiles[9].buffers->params.get_denoising_prefiltered_offset();
+       }
+       else {
+               target_buffer.denoising_output_offset = 0;
+       }
+
        tile_info_mem.copy_to_device();
 }
 
@@ -89,7 +101,8 @@ void DenoisingTask::setup_denoising_buffer()
        rect = rect_expand(rect, radius);
        rect = rect_clip(rect, make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3]));
 
-       buffer.passes = 14;
+       buffer.use_intensity = write_passes;
+       buffer.passes = buffer.use_intensity? 15 : 14;
        buffer.width = rect.z - rect.x;
        buffer.stride = align_up(buffer.width, 4);
        buffer.h = rect.w - rect.y;
@@ -129,14 +142,14 @@ void DenoisingTask::prefilter_shadowing()
        functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var);
 
        /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
-       nlm_state.set_parameters(6, 3, 4.0f, 1.0f);
+       nlm_state.set_parameters(6, 3, 4.0f, 1.0f, false);
        functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var);
 
        /* Reuse memory, the previous data isn't needed anymore. */
        device_ptr filtered_a = *buffer_var,
                   filtered_b = *sample_var;
        /* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */
-       nlm_state.set_parameters(5, 3, 1.0f, 0.25f);
+       nlm_state.set_parameters(5, 3, 1.0f, 0.25f, false);
        functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a);
        functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b);
 
@@ -147,7 +160,7 @@ void DenoisingTask::prefilter_shadowing()
        device_ptr final_a = *unfiltered_a,
                   final_b = *unfiltered_b;
        /* Use the residual variance for a second filter pass. */
-       nlm_state.set_parameters(4, 2, 1.0f, 0.5f);
+       nlm_state.set_parameters(4, 2, 1.0f, 0.5f, false);
        functions.non_local_means(filtered_a, filtered_b, residual_var, final_a);
        functions.non_local_means(filtered_b, filtered_a, residual_var, final_b);
 
@@ -167,9 +180,9 @@ void DenoisingTask::prefilter_features()
        for(int pass = 0; pass < 7; pass++) {
                device_sub_ptr feature_pass(buffer.mem, pass_to[pass]*buffer.pass_stride, buffer.pass_stride);
                /* Get the unfiltered pass and its variance from the RenderBuffers. */
-               functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance);
+               functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance, 1.0f / render_buffer.samples);
                /* Smooth the pass and store the result in the denoising buffers. */
-               nlm_state.set_parameters(2, 2, 1.0f, 0.25f);
+               nlm_state.set_parameters(2, 2, 1.0f, 0.25f, false);
                functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass);
        }
 }
@@ -188,13 +201,33 @@ void DenoisingTask::prefilter_color()
        for(int pass = 0; pass < num_color_passes; pass++) {
                device_sub_ptr color_pass(temporary_color, pass*buffer.pass_stride, buffer.pass_stride);
                device_sub_ptr color_var_pass(buffer.mem, variance_to[pass]*buffer.pass_stride, buffer.pass_stride);
-               functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass);
+               functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass, 1.0f / render_buffer.samples);
        }
 
        device_sub_ptr depth_pass    (buffer.mem,                                 0,   buffer.pass_stride);
        device_sub_ptr color_var_pass(buffer.mem, variance_to[0]*buffer.pass_stride, 3*buffer.pass_stride);
        device_sub_ptr output_pass   (buffer.mem,     mean_to[0]*buffer.pass_stride, 3*buffer.pass_stride);
        functions.detect_outliers(temporary_color.device_pointer, *color_var_pass, *depth_pass, *output_pass);
+
+       if(buffer.use_intensity) {
+               device_sub_ptr intensity_pass(buffer.mem, 14*buffer.pass_stride, buffer.pass_stride);
+               nlm_state.set_parameters(radius, 4, 2.0f, nlm_k_2*4.0f, true);
+               functions.non_local_means(*output_pass, *output_pass, *color_var_pass, *intensity_pass);
+       }
+}
+
+void DenoisingTask::write_buffer()
+{
+       reconstruction_state.buffer_params = make_int4(target_buffer.offset,
+                                                      target_buffer.stride,
+                                                      target_buffer.pass_stride,
+                                                      target_buffer.denoising_clean_offset);
+       int num_passes = buffer.use_intensity? 15 : 14;
+       for(int pass = 0; pass < num_passes; pass++) {
+               device_sub_ptr from_pass(buffer.mem, pass*buffer.pass_stride, buffer.pass_stride);
+               int out_offset = pass + target_buffer.denoising_output_offset;
+               functions.write_feature(out_offset, *from_pass, target_buffer.ptr);
+       }
 }
 
 void DenoisingTask::construct_transform()
@@ -212,6 +245,8 @@ void DenoisingTask::reconstruct()
 {
        storage.XtWX.alloc_to_device(storage.w*storage.h*XTWX_SIZE, false);
        storage.XtWY.alloc_to_device(storage.w*storage.h*XTWY_SIZE, false);
+       storage.XtWX.zero_to_device();
+       storage.XtWY.zero_to_device();
 
        reconstruction_state.filter_window = rect_from_shape(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h);
        int tile_coordinate_offset = filter_area.y*target_buffer.stride + filter_area.x;
@@ -224,7 +259,12 @@ void DenoisingTask::reconstruct()
 
        device_sub_ptr color_ptr    (buffer.mem,  8*buffer.pass_stride, 3*buffer.pass_stride);
        device_sub_ptr color_var_ptr(buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride);
-       functions.reconstruct(*color_ptr, *color_var_ptr, target_buffer.ptr);
+
+       device_ptr scale_ptr = 0;
+       device_sub_ptr *scale_sub_ptr = NULL;
+       functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr);
+       delete scale_sub_ptr;
+       functions.solve(target_buffer.ptr);
 }
 
 void DenoisingTask::run_denoising(RenderTile *tile)
@@ -240,8 +280,14 @@ void DenoisingTask::run_denoising(RenderTile *tile)
        prefilter_features();
        prefilter_color();
 
-       construct_transform();
-       reconstruct();
+       if(do_filter) {
+               construct_transform();
+               reconstruct();
+       }
+
+       if(write_passes) {
+               write_buffer();
+       }
 
        functions.unmap_neighbor_tiles(rtiles);
 }
index beae60c220faa7084848a15ea32c47a236d89855..cddcd3bd0c9a11cc2922445f3bfbfb684b3691c8 100644 (file)
@@ -47,6 +47,7 @@ public:
                int stride;
                int pass_stride;
                int denoising_clean_offset;
+               int denoising_output_offset;
                device_ptr ptr;
        } target_buffer;
 
@@ -58,6 +59,9 @@ public:
        int4 rect;
        int4 filter_area;
 
+       bool write_passes;
+       bool do_filter;
+
        struct DeviceFunctions {
                function<bool(device_ptr image_ptr,    /* Contains the values that are smoothed. */
                              device_ptr guide_ptr,    /* Contains the values that are used to calculate weights. */
@@ -66,8 +70,9 @@ public:
                              )> non_local_means;
                function<bool(device_ptr color_ptr,
                              device_ptr color_variance_ptr,
-                             device_ptr output_ptr
-                             )> reconstruct;
+                             device_ptr scale_ptr
+                             )> accumulate;
+               function<bool(device_ptr output_ptr)> solve;
                function<bool()> construct_transform;
 
                function<bool(device_ptr a_ptr,
@@ -86,13 +91,18 @@ public:
                function<bool(int mean_offset,
                              int variance_offset,
                              device_ptr mean_ptr,
-                             device_ptr variance_ptr
+                             device_ptr variance_ptr,
+                             float scale
                              )> get_feature;
                function<bool(device_ptr image_ptr,
                              device_ptr variance_ptr,
                              device_ptr depth_ptr,
                              device_ptr output_ptr
                              )> detect_outliers;
+               function<bool(int out_offset,
+                             device_ptr frop_ptr,
+                             device_ptr buffer_ptr
+                             )> write_feature;
                function<void(RenderTile *rtiles)> map_neighbor_tiles;
                function<void(RenderTile *rtiles)> unmap_neighbor_tiles;
        } functions;
@@ -114,8 +124,9 @@ public:
                int f;      /* Patch size of the filter. */
                float a;    /* Variance compensation factor in the MSE estimation. */
                float k_2;  /* Squared value of the k parameter of the filter. */
+               bool is_color;
 
-               void set_parameters(int r_, int f_, float a_, float k_2_) { r = r_; f = f_; a = a_, k_2 = k_2_; }
+               void set_parameters(int r_, int f_, float a_, float k_2_, bool is_color_) { r = r_; f = f_; a = a_, k_2 = k_2_; is_color = is_color_; }
        } nlm_state;
 
        struct Storage {
@@ -147,6 +158,7 @@ public:
                int width;
                device_only_memory<float> mem;
                device_only_memory<float> temporary_mem;
+               bool use_intensity;
 
                bool gpu_temporary_mem;
 
@@ -166,6 +178,8 @@ protected:
        void prefilter_color();
        void construct_transform();
        void reconstruct();
+
+       void write_buffer();
 };
 
 CCL_NAMESPACE_END
index 861014373b38eed125c95b863986ff1ebbb6c706..97bcde99af6266222f9b9e73ba6fb5691b97fbe3 100644 (file)
@@ -72,7 +72,13 @@ public:
        float denoising_strength;
        float denoising_feature_strength;
        bool denoising_relative_pca;
+       bool denoising_from_render;
+
+       bool denoising_do_filter;
+       bool denoising_write_passes;
+
        int pass_stride;
+       int target_pass_stride;
        int pass_denoising_data;
        int pass_denoising_clean;
 
index ea7ed4f1909cd02cee687211e24be9d37fcbddc5..4d42ddc0c53563dde14a3aeca16e2be34e1447ad 100644 (file)
@@ -419,10 +419,12 @@ protected:
                                       device_ptr out_ptr,
                                       DenoisingTask *task);
        bool denoising_construct_transform(DenoisingTask *task);
-       bool denoising_reconstruct(device_ptr color_ptr,
-                                  device_ptr color_variance_ptr,
-                                  device_ptr output_ptr,
-                                  DenoisingTask *task);
+       bool denoising_accumulate(device_ptr color_ptr,
+                                 device_ptr color_variance_ptr,
+                                 device_ptr scale_ptr,
+                                 DenoisingTask *task);
+       bool denoising_solve(device_ptr output_ptr,
+                            DenoisingTask *task);
        bool denoising_combine_halves(device_ptr a_ptr,
                                      device_ptr b_ptr,
                                      device_ptr mean_ptr,
@@ -439,7 +441,12 @@ protected:
                                   int variance_offset,
                                   device_ptr mean_ptr,
                                   device_ptr variance_ptr,
+                                  float scale,
                                   DenoisingTask *task);
+       bool denoising_write_feature(int to_offset,
+                                    device_ptr from_ptr,
+                                    device_ptr buffer_ptr,
+                                    DenoisingTask *task);
        bool denoising_detect_outliers(device_ptr image_ptr,
                                       device_ptr variance_ptr,
                                       device_ptr depth_ptr,
index d4d7c0f74bca9d90eb85a4cb35312ee0583dfe9c..a0a1cf68c327e30b52c572718ce94c12bcafb2a1 100644 (file)
@@ -748,6 +748,7 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
 
        int pass_stride = task->buffer.pass_stride;
        int num_shifts = (2*r+1)*(2*r+1);
+       int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0;
 
        device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts);
        device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts);
@@ -760,6 +761,7 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
        cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
        cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
        cl_mem out_mem = CL_MEM_PTR(out_ptr);
+       cl_mem scale_mem = NULL;
 
        mem_zero_kernel(*weightAccum, sizeof(float)*pass_stride);
        mem_zero_kernel(out_ptr, sizeof(float)*pass_stride);
@@ -773,10 +775,12 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
        kernel_set_args(ckNLMCalcDifference, 0,
                        guide_mem,
                        variance_mem,
+                       scale_mem,
                        difference_mem,
                        w, h, stride,
                        pass_stride,
-                       r, 0, a, k_2);
+                       r, channel_offset,
+                       0, a, k_2);
        kernel_set_args(ckNLMBlur, 0,
                        difference_mem,
                        blurDifference_mem,
@@ -796,6 +800,7 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
                        weightAccum_mem,
                        w, h, stride,
                        pass_stride,
+                       channel_offset,
                        r, f);
 
        enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true);
@@ -837,17 +842,14 @@ bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
        return true;
 }
 
-bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
-                                             device_ptr color_variance_ptr,
-                                             device_ptr output_ptr,
-                                             DenoisingTask *task)
+bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr,
+                                            device_ptr color_variance_ptr,
+                                            device_ptr scale_ptr,
+                                            DenoisingTask *task)
 {
-       mem_zero(task->storage.XtWX);
-       mem_zero(task->storage.XtWY);
-
        cl_mem color_mem = CL_MEM_PTR(color_ptr);
        cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
-       cl_mem output_mem = CL_MEM_PTR(output_ptr);
+       cl_mem scale_mem = CL_MEM_PTR(scale_ptr);
 
        cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
        cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
@@ -859,7 +861,6 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
        cl_kernel ckNLMBlur             = denoising_program(ustring("filter_nlm_blur"));
        cl_kernel ckNLMCalcWeight       = denoising_program(ustring("filter_nlm_calc_weight"));
        cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
-       cl_kernel ckFinalize            = denoising_program(ustring("filter_finalize"));
 
        int w = task->reconstruction_state.source_w;
        int h = task->reconstruction_state.source_h;
@@ -877,6 +878,7 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
        kernel_set_args(ckNLMCalcDifference, 0,
                        color_mem,
                        color_variance_mem,
+                       scale_mem,
                        difference_mem,
                        w, h, stride,
                        pass_stride,
@@ -913,6 +915,22 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
        enqueue_kernel(ckNLMBlur,             w*h, num_shifts, true);
        enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256);
 
+       return true;
+}
+
+bool OpenCLDeviceBase::denoising_solve(device_ptr output_ptr,
+                                       DenoisingTask *task)
+{
+       cl_kernel ckFinalize = denoising_program(ustring("filter_finalize"));
+
+       cl_mem output_mem = CL_MEM_PTR(output_ptr);
+       cl_mem rank_mem   = CL_MEM_PTR(task->storage.rank.device_pointer);
+       cl_mem XtWX_mem   = CL_MEM_PTR(task->storage.XtWX.device_pointer);
+       cl_mem XtWY_mem   = CL_MEM_PTR(task->storage.XtWY.device_pointer);
+
+       int w = task->reconstruction_state.source_w;
+       int h = task->reconstruction_state.source_h;
+
        kernel_set_args(ckFinalize, 0,
                        output_mem,
                        rank_mem,
@@ -1000,6 +1018,7 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
                                              int variance_offset,
                                              device_ptr mean_ptr,
                                              device_ptr variance_ptr,
+                                             float scale,
                                              DenoisingTask *task)
 {
        cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
@@ -1023,6 +1042,7 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
                        variance_offset,
                        mean_mem,
                        variance_mem,
+                       scale,
                        task->rect,
                        task->render_buffer.pass_stride,
                        task->render_buffer.offset);
@@ -1033,6 +1053,31 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
        return true;
 }
 
+bool OpenCLDeviceBase::denoising_write_feature(int out_offset,
+                                               device_ptr from_ptr,
+                                               device_ptr buffer_ptr,
+                                               DenoisingTask *task)
+{
+       cl_mem from_mem = CL_MEM_PTR(from_ptr);
+       cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr);
+
+       cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature"));
+
+       kernel_set_args(ckFilterWriteFeature, 0,
+                       task->render_buffer.samples,
+                       task->reconstruction_state.buffer_params,
+                       task->filter_area,
+                       from_mem,
+                       buffer_mem,
+                       out_offset,
+                       task->rect);
+       enqueue_kernel(ckFilterWriteFeature,
+                      task->filter_area.z,
+                      task->filter_area.w);
+
+       return true;
+}
+
 bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr,
                                                  device_ptr variance_ptr,
                                                  device_ptr depth_ptr,
@@ -1063,11 +1108,13 @@ bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr,
 void OpenCLDeviceBase::denoise(RenderTile &rtile, DenoisingTask& denoising)
 {
        denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising);
-       denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, &denoising);
+       denoising.functions.accumulate = function_bind(&OpenCLDeviceBase::denoising_accumulate, this, _1, _2, _3, &denoising);
+       denoising.functions.solve = function_bind(&OpenCLDeviceBase::denoising_solve, this, _1, &denoising);
        denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
        denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
        denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
-       denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+       denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
+       denoising.functions.write_feature = function_bind(&OpenCLDeviceBase::denoising_write_feature, this, _1, _2, _3, &denoising);
        denoising.functions.detect_outliers = function_bind(&OpenCLDeviceBase::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
 
        denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
index 67f4e62ac0f610df158146fcc19cf9a16298c237..9ac7c3db23dd0b26facfeb487cca539d5361b9a5 100644 (file)
@@ -27,6 +27,7 @@ typedef struct TileInfo {
        int strides[9];
        int x[4];
        int y[4];
+       int from_render;
        /* TODO(lukas): CUDA doesn't have uint64_t... */
 #ifdef __KERNEL_OPENCL__
        ccl_global float *buffers[9];
index af73c0dadf26585eed73ea872c9cde98e15ffb93..0c4387af540a3fc2879c5cd8040d0107de24d422 100644 (file)
@@ -22,6 +22,7 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
                                                          const float *ccl_restrict weight_image,
                                                          const float *ccl_restrict variance_image,
+                                                         const float *ccl_restrict scale_image,
                                                          float *difference_image,
                                                          int4 rect,
                                                          int stride,
@@ -41,13 +42,21 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
                int idx_q = (y+dy)*stride + aligned_lowx + dx;
                for(int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) {
                        float4 diff = make_float4(0.0f);
+                       float4 scale_fac;
+                       if(scale_image) {
+                               scale_fac = clamp(load4_a(scale_image, idx_p) / load4_u(scale_image, idx_q),
+                                                 make_float4(0.25f), make_float4(4.0f));
+                       }
+                       else {
+                               scale_fac = make_float4(1.0f);
+                       }
                        for(int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) {
                                /* idx_p is guaranteed to be aligned, but idx_q isn't. */
                                float4 color_p = load4_a(weight_image, idx_p + chan_ofs);
-                               float4 color_q = load4_u(weight_image, idx_q + chan_ofs);
+                               float4 color_q = scale_fac*load4_u(weight_image, idx_q + chan_ofs);
                                float4 cdiff = color_p - color_q;
                                float4 var_p = load4_a(variance_image, idx_p + chan_ofs);
-                               float4 var_q = load4_u(variance_image, idx_q + chan_ofs);
+                               float4 var_q = sqr(scale_fac)*load4_u(variance_image, idx_q + chan_ofs);
                                diff += (cdiff*cdiff - a*(var_p + min(var_p, var_q))) / (make_float4(1e-8f) + k_2*(var_p+var_q));
                        }
                        load4_a(difference_image, idx_p) = diff*channel_fac;
@@ -143,6 +152,7 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
                                                        float *out_image,
                                                        float *accum_image,
                                                        int4 rect,
+                                                       int channel_offset,
                                                        int stride,
                                                        int f)
 {
@@ -160,6 +170,11 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
                        load4_a(accum_image, idx_p) += mask(active, weight);
 
                        float4 val = load4_u(image, idx_q);
+                       if(channel_offset) {
+                               val += load4_u(image, idx_q + channel_offset);
+                               val += load4_u(image, idx_q + 2*channel_offset);
+                               val *= 1.0f/3.0f;
+                       }
 
                        load4_a(out_image, idx_p) += mask(active, weight*val);
                }
index 058afb34a920196c5c13090be3eb001af38a6a90..d8e2e4d08aa5f03b6837a827a19f20fd8f1ba318 100644 (file)
@@ -78,17 +78,25 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
                                                          int dx, int dy,
                                                          const ccl_global float *ccl_restrict weight_image,
                                                          const ccl_global float *ccl_restrict variance_image,
+                                                         const ccl_global float *ccl_restrict scale_image,
                                                          ccl_global float *difference_image,
                                                          int4 rect, int stride,
                                                          int channel_offset,
                                                          float a, float k_2)
 {
-       float diff = 0.0f;
+       int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx);
        int numChannels = channel_offset? 3 : 1;
-       for(int c = 0; c < numChannels; c++) {
-               float cdiff = weight_image[c*channel_offset + y*stride + x] - weight_image[c*channel_offset + (y+dy)*stride + (x+dx)];
-               float pvar = variance_image[c*channel_offset + y*stride + x];
-               float qvar = variance_image[c*channel_offset + (y+dy)*stride + (x+dx)];
+
+       float diff = 0.0f;
+       float scale_fac = 1.0f;
+       if(scale_image) {
+               scale_fac = clamp(scale_image[idx_p] / scale_image[idx_q], 0.25f, 4.0f);
+       }
+
+       for(int c = 0; c < numChannels; c++, idx_p += channel_offset, idx_q += channel_offset) {
+               float cdiff = weight_image[idx_p] - scale_fac*weight_image[idx_q];
+               float pvar = variance_image[idx_p];
+               float qvar = sqr(scale_fac)*variance_image[idx_q];
                diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
        }
        if(numChannels > 1) {
@@ -133,7 +141,8 @@ ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
                                                        const ccl_global float *ccl_restrict image,
                                                        ccl_global float *out_image,
                                                        ccl_global float *accum_image,
-                                                       int4 rect, int stride, int f)
+                                                       int4 rect, int channel_offset,
+                                                       int stride, int f)
 {
        float sum = 0.0f;
        const int low = max(rect.x, x-f);
@@ -142,12 +151,21 @@ ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
                sum += difference_image[y*stride + x1];
        }
        sum *= 1.0f/(high-low);
+
+       int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx);
        if(out_image) {
-               atomic_add_and_fetch_float(accum_image + y*stride + x, sum);
-               atomic_add_and_fetch_float(out_image + y*stride + x, sum*image[(y+dy)*stride + (x+dx)]);
+               atomic_add_and_fetch_float(accum_image + idx_p, sum);
+
+               float val = image[idx_q];
+               if(channel_offset) {
+                       val += image[idx_q + channel_offset];
+                       val += image[idx_q + 2*channel_offset];
+                       val *= 1.0f/3.0f;
+               }
+               atomic_add_and_fetch_float(out_image + idx_p, sum*val);
        }
        else {
-               accum_image[y*stride + x] = sum;
+               accum_image[idx_p] = sum;
        }
 }
 
index 3507f80df4657742da7ee966eee14e9894aa0c3a..41be4dbea49cba8faee1653e0f74ef7e32df977a 100644 (file)
@@ -84,6 +84,7 @@ ccl_device void kernel_filter_get_feature(int sample,
                                           int x, int y,
                                           ccl_global float *mean,
                                           ccl_global float *variance,
+                                          float scale,
                                           int4 rect, int buffer_pass_stride,
                                           int buffer_denoising_offset)
 {
@@ -95,18 +96,38 @@ ccl_device void kernel_filter_get_feature(int sample,
        int buffer_w = align_up(rect.z - rect.x, 4);
        int idx = (y-rect.y)*buffer_w + (x - rect.x);
 
-       mean[idx] = center_buffer[m_offset] / sample;
-       if(sample > 1) {
-               /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
-                * update does not work efficiently with atomics in the kernel. */
-               variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1)));
-       }
-       else {
-               /* Can't compute variance with single sample, just set it very high. */
-               variance[idx] = 1e10f;
+       float val = scale * center_buffer[m_offset];
+       mean[idx] = val;
+
+       if(v_offset >= 0) {
+               if(sample > 1) {
+                       /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
+                        * update does not work efficiently with atomics in the kernel. */
+                       variance[idx] = max(0.0f, (center_buffer[v_offset] - val*val*sample) / (sample * (sample-1)));
+               }
+               else {
+                       /* Can't compute variance with single sample, just set it very high. */
+                       variance[idx] = 1e10f;
+               }
        }
 }
 
+ccl_device void kernel_filter_write_feature(int sample,
+                                            int x, int y,
+                                            int4 buffer_params,
+                                            ccl_global float *from,
+                                            ccl_global float *buffer,
+                                            int out_offset,
+                                            int4 rect)
+{
+       ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z;
+
+       int buffer_w = align_up(rect.z - rect.x, 4);
+       int idx = (y-rect.y)*buffer_w + (x - rect.x);
+
+       combined_buffer[out_offset] = from[idx];
+}
+
 ccl_device void kernel_filter_detect_outliers(int x, int y,
                                               ccl_global float *image,
                                               ccl_global float *variance,
index 58740d5b06a84fab3587991aabae72ddf3a073c8..e5d3b0da835b04d1e0b926724b62db6954f0ba18 100644 (file)
@@ -108,11 +108,13 @@ ccl_device_inline void kernel_filter_finalize(int x, int y,
        final_color = max(final_color, make_float3(0.0f, 0.0f, 0.0f));
 
        ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z;
-       final_color *= sample;
-       if(buffer_params.w) {
-               final_color.x += combined_buffer[buffer_params.w+0];
-               final_color.y += combined_buffer[buffer_params.w+1];
-               final_color.z += combined_buffer[buffer_params.w+2];
+       if(buffer_params.w >= 0) {
+               final_color *= sample;
+               if(buffer_params.w > 0) {
+                       final_color.x += combined_buffer[buffer_params.w+0];
+                       final_color.y += combined_buffer[buffer_params.w+1];
+                       final_color.z += combined_buffer[buffer_params.w+2];
+               }
        }
        combined_buffer[0] = final_color.x;
        combined_buffer[1] = final_color.y;
index 864aa7c470a8597a168713fde523062ab5249e84..caa0057d9972d2a1a600ee7f81a72ed1af13761f 100644 (file)
@@ -472,8 +472,17 @@ typedef enum DenoisingPassOffsets {
        DENOISING_PASS_COLOR_VAR          = 23,
        DENOISING_PASS_CLEAN              = 26,
 
+       DENOISING_PASS_PREFILTERED_DEPTH     = 0,
+       DENOISING_PASS_PREFILTERED_NORMAL    = 1,
+       DENOISING_PASS_PREFILTERED_SHADOWING = 4,
+       DENOISING_PASS_PREFILTERED_ALBEDO    = 5,
+       DENOISING_PASS_PREFILTERED_COLOR     = 8,
+       DENOISING_PASS_PREFILTERED_VARIANCE  = 11,
+       DENOISING_PASS_PREFILTERED_INTENSITY = 14,
+
        DENOISING_PASS_SIZE_BASE          = 26,
        DENOISING_PASS_SIZE_CLEAN         = 3,
+       DENOISING_PASS_SIZE_PREFILTERED   = 15,
 } DenoisingPassOffsets;
 
 typedef enum eBakePassFilter {
index e036b53b8108b0802c16fa4eb37218cd2e728d41..08333c7a455c327c4e25b0054ec03065caaf63b8 100644 (file)
@@ -37,10 +37,20 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
                                                    int y,
                                                    float *mean,
                                                    float *variance,
+                                                   float scale,
                                                    int* prefilter_rect,
                                                    int buffer_pass_stride,
                                                    int buffer_denoising_offset);
 
+void KERNEL_FUNCTION_FULL_NAME(filter_write_feature)(int sample,
+                                                     int x,
+                                                     int y,
+                                                     int *buffer_params,
+                                                     float *from,
+                                                     float *buffer,
+                                                     int out_offset,
+                                                     int* prefilter_rect);
+
 void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y,
                                                        ccl_global float *image,
                                                        ccl_global float *variance,
@@ -71,7 +81,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
                                                            int dy,
                                                            float *weight_image,
-                                                           float *variance,
+                                                           float *variance_image,
+                                                           float *scale_image,
                                                            float *difference_image,
                                                            int* rect,
                                                            int stride,
@@ -99,6 +110,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
                                                          float *out_image,
                                                          float *accum_image,
                                                          int* rect,
+                                                         int channel_offset,
                                                          int stride,
                                                          int f);
 
index 4c75871148157ff38311add27f794bc31375486b..b792367e3abd02ba30ba09c029bdac54c7e9ce4b 100644 (file)
@@ -69,6 +69,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
                                                    int x,
                                                    int y,
                                                    float *mean, float *variance,
+                                                   float scale,
                                                    int* prefilter_rect,
                                                    int buffer_pass_stride,
                                                    int buffer_denoising_offset)
@@ -80,12 +81,29 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
                                  m_offset, v_offset,
                                  x, y,
                                  mean, variance,
+                                 scale,
                                  load_int4(prefilter_rect),
                                  buffer_pass_stride,
                                  buffer_denoising_offset);
 #endif
 }
 
+void KERNEL_FUNCTION_FULL_NAME(filter_write_feature)(int sample,
+                                                     int x,
+                                                     int y,
+                                                     int *buffer_params,
+                                                     float *from,
+                                                     float *buffer,
+                                                     int out_offset,
+                                                     int* prefilter_rect)
+{
+#ifdef KERNEL_STUB
+       STUB_ASSERT(KERNEL_ARCH, filter_write_feature);
+#else
+       kernel_filter_write_feature(sample, x, y, load_int4(buffer_params), from, buffer, out_offset, load_int4(prefilter_rect));
+#endif
+}
+
 void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y,
                                                        ccl_global float *image,
                                                        ccl_global float *variance,
@@ -130,8 +148,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
 #ifdef KERNEL_STUB
        STUB_ASSERT(KERNEL_ARCH, filter_construct_transform);
 #else
-  rank += storage_ofs;
-  transform += storage_ofs*TRANSFORM_SIZE;
+       rank += storage_ofs;
+       transform += storage_ofs*TRANSFORM_SIZE;
        kernel_filter_construct_transform(buffer,
                                          x, y,
                                          load_int4(prefilter_rect),
@@ -146,7 +164,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
                                                            int dy,
                                                            float *weight_image,
-                                                           float *variance,
+                                                           float *variance_image,
+                                                           float *scale_image,
                                                            float *difference_image,
                                                            int *rect,
                                                            int stride,
@@ -157,7 +176,15 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
 #ifdef KERNEL_STUB
        STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference);
 #else
-       kernel_filter_nlm_calc_difference(dx, dy, weight_image, variance, difference_image, load_int4(rect), stride, channel_offset, a, k_2);
+       kernel_filter_nlm_calc_difference(dx, dy,
+                                         weight_image,
+                                         variance_image,
+                                         scale_image,
+                                         difference_image,
+                                         load_int4(rect),
+                                         stride,
+                                         channel_offset,
+                                         a, k_2);
 #endif
 }
 
@@ -195,13 +222,22 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
                                                          float *out_image,
                                                          float *accum_image,
                                                          int *rect,
+                                                         int channel_offset,
                                                          int stride,
                                                          int f)
 {
 #ifdef KERNEL_STUB
        STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
 #else
-       kernel_filter_nlm_update_output(dx, dy, difference_image, image, temp_image, out_image, accum_image, load_int4(rect), stride, f);
+       kernel_filter_nlm_update_output(dx, dy,
+                                       difference_image,
+                                       image,
+                                       temp_image,
+                                       out_image,
+                                       accum_image,
+                                       load_int4(rect),
+                                       channel_offset,
+                                       stride, f);
 #endif
 }
 
@@ -222,7 +258,15 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
 #ifdef KERNEL_STUB
        STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
 #else
-       kernel_filter_nlm_construct_gramian(dx, dy, difference_image, buffer, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_window), stride, f, pass_stride);
+       kernel_filter_nlm_construct_gramian(dx, dy,
+                                           difference_image,
+                                           buffer,
+                                           transform, rank,
+                                           XtWX, XtWY,
+                                           load_int4(rect),
+                                           load_int4(filter_window),
+                                           stride, f,
+                                           pass_stride);
 #endif
 }
 
index b856cbde45c487fdab9d7c127465951bcf6e2008..3b51bb41aed13e70603438ef377587b32638db69 100644 (file)
@@ -64,6 +64,7 @@ kernel_cuda_filter_get_feature(int sample,
                                int v_offset,
                                float *mean,
                                float *variance,
+                               float scale,
                                int4 prefilter_rect,
                                int buffer_pass_stride,
                                int buffer_denoising_offset)
@@ -76,12 +77,37 @@ kernel_cuda_filter_get_feature(int sample,
                                          m_offset, v_offset,
                                          x, y,
                                          mean, variance,
+                                         scale,
                                          prefilter_rect,
                                          buffer_pass_stride,
                                          buffer_denoising_offset);
        }
 }
 
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_write_feature(int sample,
+                                 int4 buffer_params,
+                                 int4 filter_area,
+                                 float *from,
+                                 float *buffer,
+                                 int out_offset,
+                                 int4 prefilter_rect)
+{
+       int x = blockDim.x*blockIdx.x + threadIdx.x;
+       int y = blockDim.y*blockIdx.y + threadIdx.y;
+       if(x < filter_area.z && y < filter_area.w) {
+               kernel_filter_write_feature(sample,
+                                           x + filter_area.x,
+                                           y + filter_area.y,
+                                           buffer_params,
+                                           from,
+                                           buffer,
+                                           out_offset,
+                                           prefilter_rect);
+       }
+}
+
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
 kernel_cuda_filter_detect_outliers(float *image,
@@ -136,6 +162,7 @@ extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
 kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
                                        const float *ccl_restrict variance_image,
+                                       const float *ccl_restrict scale_image,
                                        float *difference_image,
                                        int w,
                                        int h,
@@ -152,9 +179,11 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
                kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
                                                  weight_image,
                                                  variance_image,
+                                                 scale_image,
                                                  difference_image + ofs,
                                                  rect, stride,
-                                                 channel_offset, a, k_2);
+                                                 channel_offset,
+                                                 a, k_2);
        }
 }
 
@@ -210,6 +239,7 @@ kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image,
                                      int h,
                                      int stride,
                                      int pass_stride,
+                                     int channel_offset,
                                      int r,
                                      int f)
 {
@@ -221,7 +251,9 @@ kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image,
                                                image,
                                                out_image,
                                                accum_image,
-                                               rect, stride, f);
+                                               rect,
+                                               channel_offset,
+                                               stride, f);
        }
 }
 
index a550f97f4eb6839d7f6266fa5c8567e71220adc3..8a821ee281d08223091b4abd186874d2dcef5e2b 100644 (file)
@@ -56,6 +56,7 @@ __kernel void kernel_ocl_filter_get_feature(int sample,
                                             int v_offset,
                                             ccl_global float *mean,
                                             ccl_global float *variance,
+                                            float scale,
                                             int4 prefilter_rect,
                                             int buffer_pass_stride,
                                             int buffer_denoising_offset)
@@ -68,12 +69,35 @@ __kernel void kernel_ocl_filter_get_feature(int sample,
                                          m_offset, v_offset,
                                          x, y,
                                          mean, variance,
+                                         scale,
                                          prefilter_rect,
                                          buffer_pass_stride,
                                          buffer_denoising_offset);
        }
 }
 
+__kernel void kernel_ocl_filter_write_feature(int sample,
+                                              int4 buffer_params,
+                                              int4 filter_area,
+                                              ccl_global float *from,
+                                              ccl_global float *buffer,
+                                              int out_offset,
+                                              int4 prefilter_rect)
+{
+       int x = get_global_id(0);
+       int y = get_global_id(1);
+       if(x < filter_area.z && y < filter_area.w) {
+               kernel_filter_write_feature(sample,
+                                           x + filter_area.x,
+                                           y + filter_area.y,
+                                           buffer_params,
+                                           from,
+                                           buffer,
+                                           out_offset,
+                                           prefilter_rect);
+       }
+}
+
 __kernel void kernel_ocl_filter_detect_outliers(ccl_global float *image,
                                                 ccl_global float *variance,
                                                 ccl_global float *depth,
@@ -128,6 +152,7 @@ __kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_
 
 __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_restrict weight_image,
                                                     const ccl_global float *ccl_restrict variance_image,
+                                                    const ccl_global float *ccl_restrict scale_image,
                                                     ccl_global float *difference_image,
                                                     int w,
                                                     int h,
@@ -144,9 +169,11 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_
                kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
                                                  weight_image,
                                                  variance_image,
+                                                 scale_image,
                                                  difference_image + ofs,
                                                  rect, stride,
-                                                 channel_offset, a, k_2);
+                                                 channel_offset,
+                                                 a, k_2);
        }
 }
 
@@ -196,6 +223,7 @@ __kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_re
                                                   int h,
                                                   int stride,
                                                   int pass_stride,
+                                                  int channel_offset,
                                                   int r,
                                                   int f)
 {
@@ -207,7 +235,9 @@ __kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_re
                                                image,
                                                out_image,
                                                accum_image,
-                                               rect, stride, f);
+                                               rect,
+                                               channel_offset,
+                                               stride, f);
        }
 }
 
index f901885e679ac94db46ca7daafca587462cbe2d4..66b8ef73acc1180b3ad34df4538f67ccabc49f6c 100644 (file)
@@ -42,6 +42,7 @@ BufferParams::BufferParams()
 
        denoising_data_pass = false;
        denoising_clean_pass = false;
+       denoising_prefiltered_pass = false;
 
        Pass::add(PASS_COMBINED, passes);
 }
@@ -73,6 +74,7 @@ int BufferParams::get_passes_size()
        if(denoising_data_pass) {
                size += DENOISING_PASS_SIZE_BASE;
                if(denoising_clean_pass) size += DENOISING_PASS_SIZE_CLEAN;
+               if(denoising_prefiltered_pass) size += DENOISING_PASS_SIZE_PREFILTERED;
        }
 
        return align_up(size, 4);
@@ -88,6 +90,20 @@ int BufferParams::get_denoising_offset()
        return offset;
 }
 
+int BufferParams::get_denoising_prefiltered_offset()
+{
+       assert(denoising_prefiltered_pass);
+
+       int offset = get_denoising_offset();
+
+       offset += DENOISING_PASS_SIZE_BASE;
+       if(denoising_clean_pass) {
+               offset += DENOISING_PASS_SIZE_CLEAN;
+       }
+
+       return offset;
+}
+
 /* Render Buffer Task */
 
 RenderTile::RenderTile()
@@ -153,81 +169,62 @@ bool RenderBuffers::get_denoising_pass_rect(int type, float exposure, int sample
                return false;
        }
 
-       float invsample = 1.0f/sample;
-       float scale = invsample;
-       bool variance = (type == DENOISING_PASS_NORMAL_VAR) ||
-                       (type == DENOISING_PASS_ALBEDO_VAR) ||
-                       (type == DENOISING_PASS_DEPTH_VAR) ||
-                       (type == DENOISING_PASS_COLOR_VAR);
+       float scale = 1.0f;
+       float alpha_scale = 1.0f/sample;
+       if(type == DENOISING_PASS_PREFILTERED_COLOR ||
+          type == DENOISING_PASS_CLEAN ||
+          type == DENOISING_PASS_PREFILTERED_INTENSITY) {
+               scale *= exposure;
+       }
+       else if(type == DENOISING_PASS_PREFILTERED_VARIANCE) {
+               scale *= exposure*exposure * (sample - 1);
+       }
 
-       float scale_exposure = scale;
-       if(type == DENOISING_PASS_COLOR || type == DENOISING_PASS_CLEAN) {
-               scale_exposure *= exposure;
+       int offset;
+       if(type == DENOISING_PASS_CLEAN) {
+               /* The clean pass isn't changed by prefiltering, so we use the original one there. */
+               offset = type + params.get_denoising_offset();
        }
-       else if(type == DENOISING_PASS_COLOR_VAR) {
-               scale_exposure *= exposure*exposure;
+       else if (type == DENOISING_PASS_PREFILTERED_COLOR && !params.denoising_prefiltered_pass) {
+               /* If we're not saving the prefiltering result, return the original noisy pass. */
+               offset = params.get_denoising_offset() + DENOISING_PASS_COLOR;
+               scale /= sample;
+       }
+       else {
+               offset = type + params.get_denoising_prefiltered_offset();
        }
 
-       int offset = type + params.get_denoising_offset();
        int pass_stride = params.get_passes_size();
        int size = params.width*params.height;
 
-       if(variance) {
-               /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
-                * update does not work efficiently with atomics in the kernel. */
-               int mean_offset = offset - components;
-               float *mean = buffer.data() + mean_offset;
-               float *var = buffer.data() + offset;
-               assert(mean_offset >= 0);
-
-               if(components == 1) {
-                       for(int i = 0; i < size; i++, mean += pass_stride, var += pass_stride, pixels++) {
-                               pixels[0] = max(0.0f, var[0] - mean[0]*mean[0]*invsample)*scale_exposure;
-                       }
+       float *in = buffer.data() + offset;
+
+       if(components == 1) {
+               for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
+                       pixels[0] = in[0]*scale;
                }
-               else if(components == 3) {
-                       for(int i = 0; i < size; i++, mean += pass_stride, var += pass_stride, pixels += 3) {
-                               pixels[0] = max(0.0f, var[0] - mean[0]*mean[0]*invsample)*scale_exposure;
-                               pixels[1] = max(0.0f, var[1] - mean[1]*mean[1]*invsample)*scale_exposure;
-                               pixels[2] = max(0.0f, var[2] - mean[2]*mean[2]*invsample)*scale_exposure;
-                       }
+       }
+       else if(components == 3) {
+               for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) {
+                       pixels[0] = in[0]*scale;
+                       pixels[1] = in[1]*scale;
+                       pixels[2] = in[2]*scale;
                }
-               else {
-                       return false;
+       }
+       else if(components == 4) {
+               /* Since the alpha channel is not involved in denoising, output the Combined alpha channel. */
+               assert(params.passes[0].type == PASS_COMBINED);
+               float *in_combined = buffer.data();
+
+               for(int i = 0; i < size; i++, in += pass_stride, in_combined += pass_stride, pixels += 4) {
+                       pixels[0] = in[0]*scale;
+                       pixels[1] = in[1]*scale;
+                       pixels[2] = in[2]*scale;
+                       pixels[3] = saturate(in_combined[3]*alpha_scale);
                }
        }
        else {
-               float *in = buffer.data() + offset;
-
-               if(components == 1) {
-                       for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
-                               pixels[0] = in[0]*scale_exposure;
-                       }
-               }
-               else if(components == 3) {
-                       for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) {
-                               pixels[0] = in[0]*scale_exposure;
-                               pixels[1] = in[1]*scale_exposure;
-                               pixels[2] = in[2]*scale_exposure;
-                       }
-               }
-               else if(components == 4) {
-                       assert(type == DENOISING_PASS_COLOR);
-
-                       /* Since the alpha channel is not involved in denoising, output the Combined alpha channel. */
-                       assert(params.passes[0].type == PASS_COMBINED);
-                       float *in_combined = buffer.data();
-
-                       for(int i = 0; i < size; i++, in += pass_stride, in_combined += pass_stride, pixels += 4) {
-                               pixels[0] = in[0]*scale_exposure;
-                               pixels[1] = in[1]*scale_exposure;
-                               pixels[2] = in[2]*scale_exposure;
-                               pixels[3] = saturate(in_combined[3]*scale);
-                       }
-               }
-               else {
-                       return false;
-               }
+               return false;
        }
 
        return true;
index 46c3b89bd842338b83aa923fd9fdbfe2f4f693cb..0a010718d6d3a3b452f93084a2e6b1ce25d01045 100644 (file)
@@ -54,6 +54,10 @@ public:
        bool denoising_data_pass;
        /* If only some light path types should be denoised, an additional pass is needed. */
        bool denoising_clean_pass;
+       /* When we're prefiltering the passes during rendering, we need to keep both the
+        * original and the prefiltered data around because neighboring tiles might still
+        * need the original data. */
+       bool denoising_prefiltered_pass;
 
        /* functions */
        BufferParams();
@@ -63,6 +67,7 @@ public:
        void add_pass(PassType type);
        int get_passes_size();
        int get_denoising_offset();
+       int get_denoising_prefiltered_offset();
 };
 
 /* Render Buffers */
index d0f15496e50a40d9b0d59ef46d3537cde49e6e3e..b305fa59123029f3cd282458f876dc52b3c1bd9e 100644 (file)
@@ -286,6 +286,7 @@ NODE_DEFINE(Film)
 
        SOCKET_BOOLEAN(denoising_data_pass,  "Generate Denoising Data Pass",  false);
        SOCKET_BOOLEAN(denoising_clean_pass, "Generate Denoising Clean Pass", false);
+       SOCKET_BOOLEAN(denoising_prefiltered_pass, "Generate Denoising Prefiltered Pass", false);
        SOCKET_INT(denoising_flags, "Denoising Flags", 0);
 
        return type;
@@ -469,6 +470,9 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
                        kfilm->pass_stride += DENOISING_PASS_SIZE_CLEAN;
                        kfilm->use_light_pass = 1;
                }
+               if(denoising_prefiltered_pass) {
+                       kfilm->pass_stride += DENOISING_PASS_SIZE_PREFILTERED;
+               }
        }
 
        kfilm->pass_stride = align_up(kfilm->pass_stride, 4);
index c597db4e4c5406d19b52f1ba2a9797c6edee5308..8330a4cf413e7fc01ae11eeb735f3719497d1bbd 100644 (file)
@@ -60,6 +60,7 @@ public:
        vector<Pass> passes;
        bool denoising_data_pass;
        bool denoising_clean_pass;
+       bool denoising_prefiltered_pass;
        int denoising_flags;
        float pass_alpha_threshold;
 
index c818f2b496cd77f5ad0c0edbb56806665ef97880..3cee3b8bece812b29100350904e7367fc8f6af39 100644 (file)
@@ -689,7 +689,7 @@ DeviceRequestedFeatures Session::get_requested_device_features()
        BakeManager *bake_manager = scene->bake_manager;
        requested_features.use_baking = bake_manager->get_baking();
        requested_features.use_integrator_branched = (scene->integrator->method == Integrator::BRANCHED_PATH);
-       if(params.denoising_passes) {
+       if(params.run_denoising) {
                requested_features.use_denoising = true;
                requested_features.use_shadow_tricks = true;
        }
@@ -927,7 +927,7 @@ void Session::update_status_time(bool show_pause, bool show_done)
                         */
                        substatus += string_printf(", Sample %d/%d", progress.get_current_sample(), num_samples);
                }
-               if(params.use_denoising) {
+               if(params.run_denoising) {
                        substatus += string_printf(", Denoised %d tiles", progress.get_denoised_tiles());
                }
        }
@@ -975,7 +975,7 @@ void Session::render()
        task.requested_tile_size = params.tile_size;
        task.passes_size = tile_manager.params.get_passes_size();
 
-       if(params.use_denoising) {
+       if(params.run_denoising) {
                task.denoising_radius = params.denoising_radius;
                task.denoising_strength = params.denoising_strength;
                task.denoising_feature_strength = params.denoising_feature_strength;
@@ -983,8 +983,13 @@ void Session::render()
 
                assert(!scene->film->need_update);
                task.pass_stride = scene->film->pass_stride;
+               task.target_pass_stride = task.pass_stride;
                task.pass_denoising_data = scene->film->denoising_data_offset;
                task.pass_denoising_clean = scene->film->denoising_clean_offset;
+
+               task.denoising_from_render = true;
+               task.denoising_do_filter = params.full_denoising;
+               task.denoising_write_passes = params.write_denoising_passes;
        }
 
        device->task_add(task);
index c7f590915e7e4700153b066c27904c6f77c3bc2a..cb1d8fed68faa951aebd183ed0a264c494eef911 100644 (file)
@@ -60,8 +60,9 @@ public:
 
        bool display_buffer_linear;
 
-       bool use_denoising;
-       bool denoising_passes;
+       bool run_denoising;
+       bool write_denoising_passes;
+       bool full_denoising;
        int denoising_radius;
        float denoising_strength;
        float denoising_feature_strength;
@@ -94,8 +95,9 @@ public:
 
                use_profiling = false;
 
-               use_denoising = false;
-               denoising_passes = false;
+               run_denoising = false;
+               write_denoising_passes = false;
+               full_denoising = false;
                denoising_radius = 8;
                denoising_strength = 0.0f;
                denoising_feature_strength = 0.0f;