Cycles: animation denoising support in the kernel.
authorLukas Stockner <lukas.stockner@freenet.de>
Wed, 6 Feb 2019 13:19:20 +0000 (14:19 +0100)
committerBrecht Van Lommel <brechtvanlommel@gmail.com>
Wed, 6 Feb 2019 14:18:42 +0000 (15:18 +0100)
This is the internal implementation, not available from the API or
interface yet. The algorithm takes into account past and future frames,
both to get more coherent animation and reduce noise.

Ref D3889.

20 files changed:
intern/cycles/device/device_cpu.cpp
intern/cycles/device/device_cuda.cpp
intern/cycles/device/device_denoising.cpp
intern/cycles/device/device_denoising.h
intern/cycles/device/device_task.h
intern/cycles/device/opencl/opencl.h
intern/cycles/device/opencl/opencl_base.cpp
intern/cycles/kernel/filter/filter_defines.h
intern/cycles/kernel/filter/filter_features.h
intern/cycles/kernel/filter/filter_features_sse.h
intern/cycles/kernel/filter/filter_nlm_cpu.h
intern/cycles/kernel/filter/filter_nlm_gpu.h
intern/cycles/kernel/filter/filter_reconstruction.h
intern/cycles/kernel/filter/filter_transform.h
intern/cycles/kernel/filter/filter_transform_gpu.h
intern/cycles/kernel/filter/filter_transform_sse.h
intern/cycles/kernel/kernels/cpu/filter_cpu.h
intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
intern/cycles/kernel/kernels/cuda/filter.cu
intern/cycles/kernel/kernels/opencl/filter.cl

index 6668acc9cbe3b530ff384a16e8aaee896e16b612..93c63b92a557939c6d2fa82628a7e2cf240a1103 100644 (file)
@@ -186,15 +186,15 @@ public:
        KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                               filter_detect_outliers_kernel;
        KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                               filter_combine_halves_kernel;
 
-       KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int, float, float)>      filter_nlm_calc_difference_kernel;
+       KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int, int, float, float)> filter_nlm_calc_difference_kernel;
        KernelFunctions<void(*)(float*, float*, int*, int, int)>                                              filter_nlm_blur_kernel;
        KernelFunctions<void(*)(float*, float*, int*, int, int)>                                              filter_nlm_calc_weight_kernel;
        KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, int, int, int)>       filter_nlm_update_output_kernel;
        KernelFunctions<void(*)(float*, float*, int*, int)>                                                   filter_nlm_normalize_kernel;
 
-       KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)>                         filter_construct_transform_kernel;
-       KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel;
-       KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)>                            filter_finalize_kernel;
+       KernelFunctions<void(*)(float*, TileInfo*, int, int, int, float*, int*, int*, int, int, bool, int, float)>                   filter_construct_transform_kernel;
+       KernelFunctions<void(*)(int, int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int, bool)> filter_nlm_construct_gramian_kernel;
+       KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)>                                            filter_finalize_kernel;
 
        KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*,
                               int, int, int, int, int, int, int, int, ccl_global int*, int,
@@ -512,7 +512,7 @@ public:
                                                            difference,
                                                            local_rect,
                                                            w, channel_offset,
-                                                           a, k_2);
+                                                           0, a, k_2);
 
                        filter_nlm_blur_kernel()       (difference, blurDifference, local_rect, w, f);
                        filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f);
@@ -542,6 +542,7 @@ public:
                for(int y = 0; y < task->filter_area.w; y++) {
                        for(int x = 0; x < task->filter_area.z; x++) {
                                filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer,
+                                                                   task->tile_info,
                                                                    x + task->filter_area.x,
                                                                    y + task->filter_area.y,
                                                                    y*task->filter_area.z + x,
@@ -549,6 +550,8 @@ public:
                                                                    (int*)   task->storage.rank.device_pointer,
                                                                    &task->rect.x,
                                                                    task->buffer.pass_stride,
+                                                                   task->buffer.frame_stride,
+                                                                   task->buffer.use_time,
                                                                    task->radius,
                                                                    task->pca_threshold);
                        }
@@ -559,6 +562,7 @@ public:
        bool denoising_accumulate(device_ptr color_ptr,
                                  device_ptr color_variance_ptr,
                                  device_ptr scale_ptr,
+                                 int frame,
                                  DenoisingTask *task)
        {
                ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT);
@@ -568,6 +572,7 @@ public:
                float *blurDifference = temporary_mem + task->buffer.pass_stride;
 
                int r = task->radius;
+               int frame_offset = frame * task->buffer.frame_stride;
                for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
                        int dy = i / (2*r+1) - r;
                        int dx = i % (2*r+1) - r;
@@ -583,12 +588,14 @@ public:
                                                            local_rect,
                                                            task->buffer.stride,
                                                            task->buffer.pass_stride,
+                                                           frame_offset,
                                                            1.0f,
                                                            task->nlm_k_2);
                        filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
                        filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.stride, 4);
                        filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
                        filter_nlm_construct_gramian_kernel()(dx, dy,
+                                                             task->tile_info->frames[frame],
                                                              blurDifference,
                                                              (float*)  task->buffer.mem.device_pointer,
                                                              (float*)  task->storage.transform.device_pointer,
@@ -599,7 +606,9 @@ public:
                                                              &task->reconstruction_state.filter_window.x,
                                                              task->buffer.stride,
                                                              4,
-                                                             task->buffer.pass_stride);
+                                                             task->buffer.pass_stride,
+                                                             frame_offset,
+                                                             task->buffer.use_time);
                }
 
                return true;
@@ -787,7 +796,7 @@ public:
                tile.sample = tile.start_sample + tile.num_samples;
 
                denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising);
-               denoising.functions.accumulate = function_bind(&CPUDevice::denoising_accumulate, this, _1, _2, _3, &denoising);
+               denoising.functions.accumulate = function_bind(&CPUDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
                denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising);
                denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
                denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
index cb7d8bbb2248d5de08abe0b65a0745e7a5ec81c1..e21d974ebbe5071ea2a8c2662f92c640cf090a12 100644 (file)
@@ -1301,6 +1301,7 @@ public:
                int pass_stride = task->buffer.pass_stride;
                int num_shifts = (2*r+1)*(2*r+1);
                int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0;
+               int frame_offset = 0;
 
                if(have_error())
                        return false;
@@ -1327,7 +1328,7 @@ public:
 
                        CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w*h, num_shifts);
 
-                       void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &scale_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &a, &k_2};
+                       void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &scale_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &frame_offset, &a, &k_2};
                        void *blur_args[]            = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
                        void *calc_weight_args[]     = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
                        void *update_output_args[]   = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &pass_stride, &channel_offset, &r, &f};
@@ -1367,13 +1368,16 @@ public:
                                   task->storage.h);
 
                void *args[] = {&task->buffer.mem.device_pointer,
+                               &task->tile_info_mem.device_pointer,
                                &task->storage.transform.device_pointer,
                                &task->storage.rank.device_pointer,
                                &task->filter_area,
                                &task->rect,
                                &task->radius,
                                &task->pca_threshold,
-                               &task->buffer.pass_stride};
+                               &task->buffer.pass_stride,
+                               &task->buffer.frame_stride,
+                               &task->buffer.use_time};
                CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
                cuda_assert(cuCtxSynchronize());
 
@@ -1383,6 +1387,7 @@ public:
        bool denoising_accumulate(device_ptr color_ptr,
                                  device_ptr color_variance_ptr,
                                  device_ptr scale_ptr,
+                                 int frame,
                                  DenoisingTask *task)
        {
                if(have_error())
@@ -1398,6 +1403,8 @@ public:
                int w = task->reconstruction_state.source_w;
                int h = task->reconstruction_state.source_h;
                int stride = task->buffer.stride;
+               int frame_offset = frame * task->buffer.frame_stride;
+               int t = task->tile_info->frames[frame];
 
                int pass_stride = task->buffer.pass_stride;
                int num_shifts = (2*r+1)*(2*r+1);
@@ -1430,10 +1437,12 @@ public:
                                                &w, &h,
                                                &stride, &pass_stride,
                                                &r, &pass_stride,
+                                               &frame_offset,
                                                &a, &k_2};
                void *blur_args[]            = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
                void *calc_weight_args[]     = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
-               void *construct_gramian_args[] = {&blurDifference,
+               void *construct_gramian_args[] = {&t,
+                                                 &blurDifference,
                                                  &task->buffer.mem.device_pointer,
                                                  &task->storage.transform.device_pointer,
                                                  &task->storage.rank.device_pointer,
@@ -1442,7 +1451,9 @@ public:
                                                  &task->reconstruction_state.filter_window,
                                                  &w, &h, &stride,
                                                  &pass_stride, &r,
-                                                 &f};
+                                                 &f,
+                                                 &frame_offset,
+                                                 &task->buffer.use_time};
 
                CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
                CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
@@ -1635,7 +1646,7 @@ public:
        void denoise(RenderTile &rtile, DenoisingTask& denoising)
        {
                denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising);
-               denoising.functions.accumulate = function_bind(&CUDADevice::denoising_accumulate, this, _1, _2, _3, &denoising);
+               denoising.functions.accumulate = function_bind(&CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
                denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising);
                denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
                denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
index 724171c3acba82bbd23319e187d9d0c8f26f4621..61e0ba47ab8e0c391ed675491db3964a1d95b552 100644 (file)
@@ -36,6 +36,7 @@ DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task)
                pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising_feature_strength));
        }
 
+       render_buffer.frame_stride = task.frame_stride;
        render_buffer.pass_stride = task.pass_stride;
        render_buffer.offset = task.pass_denoising_data;
 
@@ -49,6 +50,12 @@ DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task)
        tile_info = (TileInfo*) tile_info_mem.alloc(sizeof(TileInfo)/sizeof(int));
        tile_info->from_render = task.denoising_from_render? 1 : 0;
 
+       tile_info->frames[0] = 0;
+       tile_info->num_frames = min(task.denoising_frames.size() + 1, DENOISE_MAX_FRAMES);
+       for(int i = 1; i < tile_info->num_frames; i++) {
+               tile_info->frames[i] = task.denoising_frames[i-1];
+       }
+
        write_passes = task.denoising_write_passes;
        do_filter = task.denoising_do_filter;
 }
@@ -101,16 +108,18 @@ void DenoisingTask::setup_denoising_buffer()
        rect = rect_expand(rect, radius);
        rect = rect_clip(rect, make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3]));
 
-       buffer.use_intensity = write_passes;
+       buffer.use_intensity = write_passes || (tile_info->num_frames > 1);
        buffer.passes = buffer.use_intensity? 15 : 14;
        buffer.width = rect.z - rect.x;
        buffer.stride = align_up(buffer.width, 4);
        buffer.h = rect.w - rect.y;
        int alignment_floats = divide_up(device->mem_sub_ptr_alignment(), sizeof(float));
        buffer.pass_stride = align_up(buffer.stride * buffer.h, alignment_floats);
+       buffer.frame_stride = buffer.pass_stride * buffer.passes;
        /* Pad the total size by four floats since the SIMD kernels might go a bit over the end. */
-       int mem_size = align_up(buffer.pass_stride * buffer.passes + 4, alignment_floats);
+       int mem_size = align_up(tile_info->num_frames * buffer.frame_stride + 4, alignment_floats);
        buffer.mem.alloc_to_device(mem_size, false);
+       buffer.use_time = (tile_info->num_frames > 1);
 
        /* CPUs process shifts sequentially while GPUs process them in parallel. */
        int num_layers;
@@ -216,6 +225,25 @@ void DenoisingTask::prefilter_color()
        }
 }
 
+void DenoisingTask::load_buffer()
+{
+       device_ptr null_ptr = (device_ptr) 0;
+
+       int original_offset = render_buffer.offset;
+
+       int num_passes = buffer.use_intensity? 15 : 14;
+       for(int i = 0; i < tile_info->num_frames; i++) {
+               for(int pass = 0; pass < num_passes; pass++) {
+                       device_sub_ptr to_pass(buffer.mem, i*buffer.frame_stride + pass*buffer.pass_stride, buffer.pass_stride);
+                       bool is_variance = (pass >= 11) && (pass <= 13);
+                       functions.get_feature(pass, -1, *to_pass, null_ptr, is_variance? (1.0f / render_buffer.samples) : 1.0f);
+               }
+               render_buffer.offset += render_buffer.frame_stride;
+       }
+
+       render_buffer.offset = original_offset;
+}
+
 void DenoisingTask::write_buffer()
 {
        reconstruction_state.buffer_params = make_int4(target_buffer.offset,
@@ -259,11 +287,17 @@ void DenoisingTask::reconstruct()
 
        device_sub_ptr color_ptr    (buffer.mem,  8*buffer.pass_stride, 3*buffer.pass_stride);
        device_sub_ptr color_var_ptr(buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride);
-
-       device_ptr scale_ptr = 0;
-       device_sub_ptr *scale_sub_ptr = NULL;
-       functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr);
-       delete scale_sub_ptr;
+       for(int f = 0; f < tile_info->num_frames; f++) {
+               device_ptr scale_ptr = 0;
+               device_sub_ptr *scale_sub_ptr = NULL;
+               if(tile_info->frames[f] != 0 && (tile_info->num_frames > 1)) {
+                       scale_sub_ptr = new device_sub_ptr(buffer.mem, 14*buffer.pass_stride, buffer.pass_stride);
+                       scale_ptr = **scale_sub_ptr;
+               }
+
+               functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr, f);
+               delete scale_sub_ptr;
+       }
        functions.solve(target_buffer.ptr);
 }
 
@@ -276,9 +310,14 @@ void DenoisingTask::run_denoising(RenderTile *tile)
 
        setup_denoising_buffer();
 
-       prefilter_shadowing();
-       prefilter_features();
-       prefilter_color();
+       if(tile_info->from_render) {
+               prefilter_shadowing();
+               prefilter_features();
+               prefilter_color();
+       }
+       else {
+               load_buffer();
+       }
 
        if(do_filter) {
                construct_transform();
index cddcd3bd0c9a11cc2922445f3bfbfb684b3691c8..5869aa05390045913719061cc7af31c3bdca1560 100644 (file)
@@ -38,6 +38,7 @@ public:
        struct RenderBuffers {
                int offset;
                int pass_stride;
+               int frame_stride;
                int samples;
        } render_buffer;
 
@@ -70,7 +71,8 @@ public:
                              )> non_local_means;
                function<bool(device_ptr color_ptr,
                              device_ptr color_variance_ptr,
-                             device_ptr scale_ptr
+                             device_ptr scale_ptr,
+                             int frame
                              )> accumulate;
                function<bool(device_ptr output_ptr)> solve;
                function<bool()> construct_transform;
@@ -156,8 +158,10 @@ public:
                int stride;
                int h;
                int width;
+               int frame_stride;
                device_only_memory<float> mem;
                device_only_memory<float> temporary_mem;
+               bool use_time;
                bool use_intensity;
 
                bool gpu_temporary_mem;
@@ -179,6 +183,7 @@ protected:
        void construct_transform();
        void reconstruct();
 
+       void load_buffer();
        void write_buffer();
 };
 
index 97bcde99af6266222f9b9e73ba6fb5691b97fbe3..2871bc5761a894bc140fb1fb0bee795c59e98714 100644 (file)
@@ -73,11 +73,13 @@ public:
        float denoising_feature_strength;
        bool denoising_relative_pca;
        bool denoising_from_render;
+       vector<int> denoising_frames;
 
        bool denoising_do_filter;
        bool denoising_write_passes;
 
        int pass_stride;
+       int frame_stride;
        int target_pass_stride;
        int pass_denoising_data;
        int pass_denoising_clean;
index 4d42ddc0c53563dde14a3aeca16e2be34e1447ad..9b7631674594ca23d9a09d520e8867b9369b6bd2 100644 (file)
@@ -422,6 +422,7 @@ protected:
        bool denoising_accumulate(device_ptr color_ptr,
                                  device_ptr color_variance_ptr,
                                  device_ptr scale_ptr,
+                                 int frame,
                                  DenoisingTask *task);
        bool denoising_solve(device_ptr output_ptr,
                             DenoisingTask *task);
index a0a1cf68c327e30b52c572718ce94c12bcafb2a1..4417065bb7fb5770c4da628b7ac5e9cc53301fe9 100644 (file)
@@ -821,16 +821,31 @@ bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
        cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
        cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
        cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
+       cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
+
+       char use_time = task->buffer.use_time? 1 : 0;
 
        cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
 
-       kernel_set_args(ckFilterConstructTransform, 0,
-                       buffer_mem,
+       int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0,
+                                     buffer_mem,
+                                     tile_info_mem);
+       cl_mem buffers[9];
+       for(int i = 0; i < 9; i++) {
+               buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
+               arg_ofs += kernel_set_args(ckFilterConstructTransform,
+                                          arg_ofs,
+                                          buffers[i]);
+       }
+       kernel_set_args(ckFilterConstructTransform,
+                       arg_ofs,
                        transform_mem,
                        rank_mem,
                        task->filter_area,
                        task->rect,
                        task->buffer.pass_stride,
+                       task->buffer.frame_stride,
+                       use_time,
                        task->radius,
                        task->pca_threshold);
 
@@ -845,6 +860,7 @@ bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
 bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr,
                                             device_ptr color_variance_ptr,
                                             device_ptr scale_ptr,
+                                            int frame,
                                             DenoisingTask *task)
 {
        cl_mem color_mem = CL_MEM_PTR(color_ptr);
@@ -865,6 +881,9 @@ bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr,
        int w = task->reconstruction_state.source_w;
        int h = task->reconstruction_state.source_h;
        int stride = task->buffer.stride;
+       int frame_offset = frame * task->buffer.frame_stride;
+       int t = task->tile_info->frames[frame];
+       char use_time = task->buffer.use_time? 1 : 0;
 
        int r = task->radius;
        int pass_stride = task->buffer.pass_stride;
@@ -884,6 +903,7 @@ bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr,
                        pass_stride,
                        r,
                        pass_stride,
+                       frame_offset,
                        1.0f, task->nlm_k_2);
        kernel_set_args(ckNLMBlur, 0,
                        difference_mem,
@@ -898,6 +918,7 @@ bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr,
                        pass_stride,
                        r, 4);
        kernel_set_args(ckNLMConstructGramian, 0,
+                       t,
                        blurDifference_mem,
                        buffer_mem,
                        transform_mem,
@@ -907,7 +928,9 @@ bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr,
                        task->reconstruction_state.filter_window,
                        w, h, stride,
                        pass_stride,
-                       r, 4);
+                       r, 4,
+                       frame_offset,
+                       use_time);
 
        enqueue_kernel(ckNLMCalcDifference,   w*h, num_shifts, true);
        enqueue_kernel(ckNLMBlur,             w*h, num_shifts, true);
@@ -1108,7 +1131,7 @@ bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr,
 void OpenCLDeviceBase::denoise(RenderTile &rtile, DenoisingTask& denoising)
 {
        denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising);
-       denoising.functions.accumulate = function_bind(&OpenCLDeviceBase::denoising_accumulate, this, _1, _2, _3, &denoising);
+       denoising.functions.accumulate = function_bind(&OpenCLDeviceBase::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
        denoising.functions.solve = function_bind(&OpenCLDeviceBase::denoising_solve, this, _1, &denoising);
        denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
        denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
index 9ac7c3db23dd0b26facfeb487cca539d5361b9a5..cb04aac35f4e8d3a7e6f585d2739dcd79d69f6da 100644 (file)
 #ifndef __FILTER_DEFINES_H__
 #define __FILTER_DEFINES_H__
 
-#define DENOISE_FEATURES 10
+#define DENOISE_FEATURES 11
 #define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES)
 #define XTWX_SIZE      (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2)
 #define XTWY_SIZE      (DENOISE_FEATURES+1)
 
+#define DENOISE_MAX_FRAMES 16
+
 typedef struct TileInfo {
        int offsets[9];
        int strides[9];
        int x[4];
        int y[4];
        int from_render;
+       int frames[DENOISE_MAX_FRAMES];
+       int num_frames;
        /* TODO(lukas): CUDA doesn't have uint64_t... */
 #ifdef __KERNEL_OPENCL__
        ccl_global float *buffers[9];
index 6226ed2c2eff21b38c2ffaebb650dd5ae7ac1c36..e1ea6487aa94ea4c3707dbe0db5124aaae055b45 100644 (file)
 
 #define ccl_get_feature(buffer, pass) (buffer)[(pass)*pass_stride]
 
-/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).
- * pixel_buffer always points to the current pixel in the first pass. */
-#define FOR_PIXEL_WINDOW     pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
-                             for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
-                                 for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) {
+/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).+ * pixel_buffer always points to the current pixel in the first pass.
+ * Repeat the loop for every secondary frame if there are any. */
+#define FOR_PIXEL_WINDOW     for(int frame = 0; frame < tile_info->num_frames; frame++) { \
+                                 pixel.z = tile_info->frames[frame]; \
+                                 pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x) + frame*frame_stride; \
+                                 for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
+                                     for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) {
 
-#define END_FOR_PIXEL_WINDOW     } \
-                                 pixel_buffer += buffer_w - (high.x - low.x); \
+#define END_FOR_PIXEL_WINDOW         } \
+                                     pixel_buffer += buffer_w - (high.x - low.x); \
+                                 } \
                              }
 
-ccl_device_inline void filter_get_features(int2 pixel,
+ccl_device_inline void filter_get_features(int3 pixel,
                                            const ccl_global float *ccl_restrict buffer,
                                            float *features,
+                                           bool use_time,
                                            const float *ccl_restrict mean,
                                            int pass_stride)
 {
@@ -44,15 +48,20 @@ ccl_device_inline void filter_get_features(int2 pixel,
        features[7] = ccl_get_feature(buffer, 5);
        features[8] = ccl_get_feature(buffer, 6);
        features[9] = ccl_get_feature(buffer, 7);
+       if(use_time) {
+               features[10] = pixel.z;
+       }
        if(mean) {
-               for(int i = 0; i < DENOISE_FEATURES; i++)
+               for(int i = 0; i < (use_time? 11 : 10); i++) {
                        features[i] -= mean[i];
+               }
        }
 }
 
-ccl_device_inline void filter_get_feature_scales(int2 pixel,
+ccl_device_inline void filter_get_feature_scales(int3 pixel,
                                                  const ccl_global float *ccl_restrict buffer,
                                                  float *scales,
+                                                 bool use_time,
                                                  const float *ccl_restrict mean,
                                                  int pass_stride)
 {
@@ -66,13 +75,19 @@ ccl_device_inline void filter_get_feature_scales(int2 pixel,
        scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7],
                                            ccl_get_feature(buffer, 6) - mean[8],
                                            ccl_get_feature(buffer, 7) - mean[9]));
+       if(use_time) {
+               scales[6] = fabsf(pixel.z - mean[10]);
+       }
 }
 
-ccl_device_inline void filter_calculate_scale(float *scale)
+ccl_device_inline void filter_calculate_scale(float *scale, bool use_time)
 {
        scale[0] = 1.0f/max(scale[0], 0.01f);
        scale[1] = 1.0f/max(scale[1], 0.01f);
        scale[2] = 1.0f/max(scale[2], 0.01f);
+       if(use_time) {
+               scale[10] = 1.0f/max(scale[6], 0.01f);
+       }
        scale[6] = 1.0f/max(scale[4], 0.01f);
        scale[7] = scale[8] = scale[9] = 1.0f/max(sqrtf(scale[5]), 0.01f);
        scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f);
@@ -89,36 +104,46 @@ ccl_device_inline void design_row_add(float *design_row,
                                       const ccl_global float *ccl_restrict transform,
                                       int stride,
                                       int row,
-                                      float feature)
+                                      float feature,
+                                      int transform_row_stride)
 {
        for(int i = 0; i < rank; i++) {
-               design_row[1+i] += transform[(row*DENOISE_FEATURES + i)*stride]*feature;
+               design_row[1+i] += transform[(row*transform_row_stride + i)*stride]*feature;
        }
 }
 
 /* Fill the design row. */
-ccl_device_inline void filter_get_design_row_transform(int2 p_pixel,
+ccl_device_inline void filter_get_design_row_transform(int3 p_pixel,
                                                        const ccl_global float *ccl_restrict p_buffer,
-                                                       int2 q_pixel,
+                                                       int3 q_pixel,
                                                        const ccl_global float *ccl_restrict q_buffer,
                                                        int pass_stride,
                                                        int rank,
                                                        float *design_row,
                                                        const ccl_global float *ccl_restrict transform,
-                                                       int stride)
+                                                       int stride,
+                                                       bool use_time)
 {
+       int num_features = use_time? 11 : 10;
+
        design_row[0] = 1.0f;
        math_vector_zero(design_row+1, rank);
-       design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x);
-       design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y);
-       design_row_add(design_row, rank, transform, stride, 2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0)));
-       design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1));
-       design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2));
-       design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3));
-       design_row_add(design_row, rank, transform, stride, 6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4));
-       design_row_add(design_row, rank, transform, stride, 7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5));
-       design_row_add(design_row, rank, transform, stride, 8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6));
-       design_row_add(design_row, rank, transform, stride, 9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7));
+
+#define DESIGN_ROW_ADD(I, F) design_row_add(design_row, rank, transform, stride, I, F, num_features);
+       DESIGN_ROW_ADD(0, q_pixel.x - p_pixel.x);
+       DESIGN_ROW_ADD(1, q_pixel.y - p_pixel.y);
+       DESIGN_ROW_ADD(2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0)));
+       DESIGN_ROW_ADD(3,       ccl_get_feature(q_buffer, 1)  -       ccl_get_feature(p_buffer, 1));
+       DESIGN_ROW_ADD(4,       ccl_get_feature(q_buffer, 2)  -       ccl_get_feature(p_buffer, 2));
+       DESIGN_ROW_ADD(5,       ccl_get_feature(q_buffer, 3)  -       ccl_get_feature(p_buffer, 3));
+       DESIGN_ROW_ADD(6,       ccl_get_feature(q_buffer, 4)  -       ccl_get_feature(p_buffer, 4));
+       DESIGN_ROW_ADD(7,       ccl_get_feature(q_buffer, 5)  -       ccl_get_feature(p_buffer, 5));
+       DESIGN_ROW_ADD(8,       ccl_get_feature(q_buffer, 6)  -       ccl_get_feature(p_buffer, 6));
+       DESIGN_ROW_ADD(9,       ccl_get_feature(q_buffer, 7)  -       ccl_get_feature(p_buffer, 7));
+       if(use_time) {
+               DESIGN_ROW_ADD(10, q_pixel.z - p_pixel.z)
+       }
+#undef DESIGN_ROW_ADD
 }
 
 CCL_NAMESPACE_END
index 3ddd871226647e7b01cfbd2eb81ebf4e60453f8f..5dd001ffb931f1d97b5ea0e7ceb6057a9c061d1c 100644 (file)
@@ -20,26 +20,33 @@ CCL_NAMESPACE_BEGIN
 
 /* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time.
  * pixel_buffer always points to the first of the 4 current pixel in the first pass.
- * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. */
+ * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window.
+ * Repeat the loop for every secondary frame if there are any. */
+#define FOR_PIXEL_WINDOW_SSE     for(int frame = 0; frame < tile_info->num_frames; frame++) { \
+                                     pixel.z = tile_info->frames[frame]; \
+                                     pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x) + frame*frame_stride; \
+                                     float4 t4 = make_float4(pixel.z); \
+                                     for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
+                                         float4 y4 = make_float4(pixel.y); \
+                                         for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
+                                             float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \
+                                             int4 active_pixels = x4 < make_float4(high.x);
 
-#define FOR_PIXEL_WINDOW_SSE     pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
-                                 for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
-                                     float4 y4 = make_float4(pixel.y); \
-                                     for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
-                                         float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \
-                                         int4 active_pixels = x4 < make_float4(high.x);
-
-#define END_FOR_PIXEL_WINDOW_SSE     } \
-                                     pixel_buffer += buffer_w - (pixel.x - low.x); \
+#define END_FOR_PIXEL_WINDOW_SSE         } \
+                                         pixel_buffer += buffer_w - (high.x - low.x); \
+                                     } \
                                  }
 
-ccl_device_inline void filter_get_features_sse(float4 x, float4 y,
+ccl_device_inline void filter_get_features_sse(float4 x, float4 y, float4 t,
                                                int4 active_pixels,
                                                const float *ccl_restrict buffer,
                                                float4 *features,
+                                               bool use_time,
                                                const float4 *ccl_restrict mean,
                                                int pass_stride)
 {
+       int num_features = use_time? 11 : 10;
+
        features[0] = x;
        features[1] = y;
        features[2] = fabs(ccl_get_feature_sse(0));
@@ -50,18 +57,25 @@ ccl_device_inline void filter_get_features_sse(float4 x, float4 y,
        features[7] = ccl_get_feature_sse(5);
        features[8] = ccl_get_feature_sse(6);
        features[9] = ccl_get_feature_sse(7);
+       if(use_time) {
+               features[10] = t;
+       }
+
        if(mean) {
-               for(int i = 0; i < DENOISE_FEATURES; i++)
+               for(int i = 0; i < num_features; i++) {
                        features[i] = features[i] - mean[i];
+               }
        }
-       for(int i = 0; i < DENOISE_FEATURES; i++)
+       for(int i = 0; i < num_features; i++) {
                features[i] = mask(active_pixels, features[i]);
+       }
 }
 
-ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y,
+ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y, float4 t,
                                                      int4 active_pixels,
                                                      const float *ccl_restrict buffer,
                                                      float4 *scales,
+                                                     bool use_time,
                                                      const float4 *ccl_restrict mean,
                                                      int pass_stride)
 {
@@ -75,15 +89,22 @@ ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y,
        scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) +
                    sqr(ccl_get_feature_sse(6) - mean[8]) +
                    sqr(ccl_get_feature_sse(7) - mean[9]);
-       for(int i = 0; i < 6; i++)
+       if(use_time) {
+               scales[6] = fabs(t - mean[10]);
+       }
+
+       for(int i = 0; i < (use_time? 7 : 6); i++)
                scales[i] = mask(active_pixels, scales[i]);
 }
 
-ccl_device_inline void filter_calculate_scale_sse(float4 *scale)
+ccl_device_inline void filter_calculate_scale_sse(float4 *scale, bool use_time)
 {
        scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f)));
        scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f)));
        scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f)));
+       if(use_time) {
+               scale[10] = rcp(max(reduce_max(scale[6]), make_float4(0.01f)));;
+       }
        scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f)));
        scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f)));
        scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f)));
index 0c4387af540a3fc2879c5cd8040d0107de24d422..9eb3c603a4a87c939bc31edcf1ff4125f53e1ec8 100644 (file)
@@ -27,6 +27,7 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
                                                          int4 rect,
                                                          int stride,
                                                          int channel_offset,
+                                                         int frame_offset,
                                                          float a,
                                                          float k_2)
 {
@@ -39,7 +40,7 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
 
        for(int y = rect.y; y < rect.w; y++) {
                int idx_p = y*stride + aligned_lowx;
-               int idx_q = (y+dy)*stride + aligned_lowx + dx;
+               int idx_q = (y+dy)*stride + aligned_lowx + dx + frame_offset;
                for(int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) {
                        float4 diff = make_float4(0.0f);
                        float4 scale_fac;
@@ -181,7 +182,7 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
        }
 }
 
-ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
+ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, int t,
                                                            const float *ccl_restrict difference_image,
                                                            const float *ccl_restrict buffer,
                                                            float *transform,
@@ -191,7 +192,9 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
                                                            int4 rect,
                                                            int4 filter_window,
                                                            int stride, int f,
-                                                           int pass_stride)
+                                                           int pass_stride,
+                                                           int frame_offset,
+                                                           bool use_time)
 {
        int4 clip_area = rect_clip(rect, filter_window);
        /* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */
@@ -212,9 +215,11 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
                        int    *l_rank = rank + storage_ofs;
 
                        kernel_filter_construct_gramian(x, y, 1,
-                                                       dx, dy,
+                                                       dx, dy, t,
                                                        stride,
                                                        pass_stride,
+                                                       frame_offset,
+                                                       use_time,
                                                        buffer,
                                                        l_transform, l_rank,
                                                        weight, l_XtWX, l_XtWY, 0);
index d8e2e4d08aa5f03b6837a827a19f20fd8f1ba318..12636393243d0c42db5c62d2887fde4d5f2e74be 100644 (file)
@@ -82,9 +82,10 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
                                                          ccl_global float *difference_image,
                                                          int4 rect, int stride,
                                                          int channel_offset,
+                                                         int frame_offset,
                                                          float a, float k_2)
 {
-       int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx);
+       int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx) + frame_offset;
        int numChannels = channel_offset? 3 : 1;
 
        float diff = 0.0f;
@@ -170,7 +171,7 @@ ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
 }
 
 ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y,
-                                                           int dx, int dy,
+                                                           int dx, int dy, int t,
                                                            const ccl_global float *ccl_restrict difference_image,
                                                            const ccl_global float *ccl_restrict buffer,
                                                            const ccl_global float *ccl_restrict transform,
@@ -181,6 +182,8 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y,
                                                            int4 filter_window,
                                                            int stride, int f,
                                                            int pass_stride,
+                                                           int frame_offset,
+                                                           bool use_time,
                                                            int localIdx)
 {
        const int low = max(rect.x, x-f);
@@ -201,9 +204,11 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y,
 
        kernel_filter_construct_gramian(x, y,
                                        rect_size(filter_window),
-                                       dx, dy,
+                                       dx, dy, t,
                                        stride,
                                        pass_stride,
+                                       frame_offset,
+                                       use_time,
                                        buffer,
                                        transform, rank,
                                        weight, XtWX, XtWY,
index e5d3b0da835b04d1e0b926724b62db6954f0ba18..31a7487c77ac52c3060f10228a662bbcb586e6ee 100644 (file)
@@ -18,9 +18,11 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
                                                        int storage_stride,
-                                                       int dx, int dy,
+                                                       int dx, int dy, int t,
                                                        int buffer_stride,
                                                        int pass_stride,
+                                                       int frame_offset,
+                                                       bool use_time,
                                                        const ccl_global float *ccl_restrict buffer,
                                                        const ccl_global float *ccl_restrict transform,
                                                        ccl_global int *rank,
@@ -34,7 +36,7 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
        }
 
        int p_offset =  y     * buffer_stride +  x;
-       int q_offset = (y+dy) * buffer_stride + (x+dx);
+       int q_offset = (y+dy) * buffer_stride + (x+dx) + frame_offset;
 
 #ifdef __KERNEL_GPU__
        const int stride = storage_stride;
@@ -57,9 +59,9 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
                return;
        }
 
-       filter_get_design_row_transform(make_int2(x, y),       buffer + p_offset,
-                                       make_int2(x+dx, y+dy), buffer + q_offset,
-                                       pass_stride, *rank, design_row, transform, stride);
+       filter_get_design_row_transform(make_int3(x, y, t),       buffer + p_offset,
+                                       make_int3(x+dx, y+dy, t), buffer + q_offset,
+                                       pass_stride, *rank, design_row, transform, stride, use_time);
 
 #ifdef __KERNEL_GPU__
        math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride);
index a5f87c05ec0b89ea20e1c886a916d132db8d6961..94e27bb02fda6c0539bf41402ea8c6c924d5850f 100644 (file)
 CCL_NAMESPACE_BEGIN
 
 ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
+                                                  CCL_FILTER_TILE_INFO,
                                                   int x, int y, int4 rect,
-                                                  int pass_stride,
+                                                  int pass_stride, int frame_stride,
+                                                  bool use_time,
                                                   float *transform, int *rank,
                                                   int radius, float pca_threshold)
 {
@@ -26,59 +28,58 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
 
        float features[DENOISE_FEATURES];
 
-       /* Temporary storage, used in different steps of the algorithm. */
-       float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES];
-       float tempvector[2*DENOISE_FEATURES];
        const float *ccl_restrict pixel_buffer;
-       int2 pixel;
+       int3 pixel;
+
+       int num_features = use_time? 11 : 10;
 
        /* === Calculate denoising window. === */
        int2 low  = make_int2(max(rect.x, x - radius),
                              max(rect.y, y - radius));
        int2 high = make_int2(min(rect.z, x + radius + 1),
                              min(rect.w, y + radius + 1));
-       int num_pixels = (high.y - low.y) * (high.x - low.x);
+       int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
 
        /* === Shift feature passes to have mean 0. === */
        float feature_means[DENOISE_FEATURES];
-       math_vector_zero(feature_means, DENOISE_FEATURES);
+       math_vector_zero(feature_means, num_features);
        FOR_PIXEL_WINDOW {
-               filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
-               math_vector_add(feature_means, features, DENOISE_FEATURES);
+               filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride);
+               math_vector_add(feature_means, features, num_features);
        } END_FOR_PIXEL_WINDOW
 
-       math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES);
+       math_vector_scale(feature_means, 1.0f / num_pixels, num_features);
 
        /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
-       float *feature_scale = tempvector;
-       math_vector_zero(feature_scale, DENOISE_FEATURES);
+       float feature_scale[DENOISE_FEATURES];
+       math_vector_zero(feature_scale, num_features);
 
        FOR_PIXEL_WINDOW {
-               filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
-               math_vector_max(feature_scale, features, DENOISE_FEATURES);
+               filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
+               math_vector_max(feature_scale, features, num_features);
        } END_FOR_PIXEL_WINDOW
 
-       filter_calculate_scale(feature_scale);
+       filter_calculate_scale(feature_scale, use_time);
 
        /* === Generate the feature transformation. ===
-        * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space
+        * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space
         * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
-       float* feature_matrix = tempmatrix;
-       math_matrix_zero(feature_matrix, DENOISE_FEATURES);
+       float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
+       math_matrix_zero(feature_matrix, num_features);
        FOR_PIXEL_WINDOW {
-               filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
-               math_vector_mul(features, feature_scale, DENOISE_FEATURES);
-               math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
+               filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
+               math_vector_mul(features, feature_scale, num_features);
+               math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f);
        } END_FOR_PIXEL_WINDOW
 
-       math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
+       math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1);
        *rank = 0;
        /* Prevent overfitting when a small window is used. */
-       int max_rank = min(DENOISE_FEATURES, num_pixels/3);
+       int max_rank = min(num_features, num_pixels/3);
        if(pca_threshold < 0.0f) {
                float threshold_energy = 0.0f;
-               for(int i = 0; i < DENOISE_FEATURES; i++) {
-                       threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+               for(int i = 0; i < num_features; i++) {
+                       threshold_energy += feature_matrix[i*num_features+i];
                }
                threshold_energy *= 1.0f - (-pca_threshold);
 
@@ -86,13 +87,13 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
                for(int i = 0; i < max_rank; i++, (*rank)++) {
                        if(i >= 2 && reduced_energy >= threshold_energy)
                                break;
-                       float s = feature_matrix[i*DENOISE_FEATURES+i];
+                       float s = feature_matrix[i*num_features+i];
                        reduced_energy += s;
                }
        }
        else {
                for(int i = 0; i < max_rank; i++, (*rank)++) {
-                       float s = feature_matrix[i*DENOISE_FEATURES+i];
+                       float s = feature_matrix[i*num_features+i];
                        if(i >= 2 && sqrtf(s) < pca_threshold)
                                break;
                }
@@ -100,9 +101,9 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
 
        /* Bake the feature scaling into the transformation matrix. */
        for(int i = 0; i < (*rank); i++) {
-               math_vector_mul(transform + i*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES);
+               math_vector_mul(transform + i*num_features, feature_scale, num_features);
        }
-       math_matrix_transpose(transform, DENOISE_FEATURES, 1);
+       math_matrix_transpose(transform, num_features, 1);
 }
 
 CCL_NAMESPACE_END
index 83a1222bbdb25d2818bd1c4d63c4654f2830b145..ed8ddcb49b1e9eeeaa5d4a6b3c4ca0262f0247f0 100644 (file)
 CCL_NAMESPACE_BEGIN
 
 ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
+                                                  CCL_FILTER_TILE_INFO,
                                                   int x, int y, int4 rect,
-                                                  int pass_stride,
+                                                  int pass_stride, int frame_stride,
+                                                  bool use_time,
                                                   ccl_global float *transform,
                                                   ccl_global int *rank,
                                                   int radius, float pca_threshold,
@@ -33,60 +35,62 @@ ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_re
        float features[DENOISE_FEATURES];
 #endif
 
+       int num_features = use_time? 11 : 10;
+
        /* === Calculate denoising window. === */
        int2 low  = make_int2(max(rect.x, x - radius),
                              max(rect.y, y - radius));
        int2 high = make_int2(min(rect.z, x + radius + 1),
                              min(rect.w, y + radius + 1));
-       int num_pixels = (high.y - low.y) * (high.x - low.x);
+       int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
        const ccl_global float *ccl_restrict pixel_buffer;
-       int2 pixel;
+       int3 pixel;
 
 
 
 
        /* === Shift feature passes to have mean 0. === */
        float feature_means[DENOISE_FEATURES];
-       math_vector_zero(feature_means, DENOISE_FEATURES);
+       math_vector_zero(feature_means, num_features);
        FOR_PIXEL_WINDOW {
-               filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
-               math_vector_add(feature_means, features, DENOISE_FEATURES);
+               filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride);
+               math_vector_add(feature_means, features, num_features);
        } END_FOR_PIXEL_WINDOW
 
-       math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES);
+       math_vector_scale(feature_means, 1.0f / num_pixels, num_features);
 
        /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
        float feature_scale[DENOISE_FEATURES];
-       math_vector_zero(feature_scale, DENOISE_FEATURES);
+       math_vector_zero(feature_scale, num_features);
 
        FOR_PIXEL_WINDOW {
-               filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
-               math_vector_max(feature_scale, features, DENOISE_FEATURES);
+               filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
+               math_vector_max(feature_scale, features, num_features);
        } END_FOR_PIXEL_WINDOW
 
-       filter_calculate_scale(feature_scale);
+       filter_calculate_scale(feature_scale, use_time);
 
 
 
        /* === Generate the feature transformation. ===
-        * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space
+        * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space
         * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
        float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
-       math_matrix_zero(feature_matrix, DENOISE_FEATURES);
+       math_matrix_zero(feature_matrix, num_features);
        FOR_PIXEL_WINDOW {
-               filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
-               math_vector_mul(features, feature_scale, DENOISE_FEATURES);
-               math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
+               filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
+               math_vector_mul(features, feature_scale, num_features);
+               math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f);
        } END_FOR_PIXEL_WINDOW
 
-       math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride);
+       math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, transform_stride);
        *rank = 0;
        /* Prevent overfitting when a small window is used. */
-       int max_rank = min(DENOISE_FEATURES, num_pixels/3);
+       int max_rank = min(num_features, num_pixels/3);
        if(pca_threshold < 0.0f) {
                float threshold_energy = 0.0f;
-               for(int i = 0; i < DENOISE_FEATURES; i++) {
-                       threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+               for(int i = 0; i < num_features; i++) {
+                       threshold_energy += feature_matrix[i*num_features+i];
                }
                threshold_energy *= 1.0f - (-pca_threshold);
 
@@ -94,24 +98,24 @@ ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_re
                for(int i = 0; i < max_rank; i++, (*rank)++) {
                        if(i >= 2 && reduced_energy >= threshold_energy)
                                break;
-                       float s = feature_matrix[i*DENOISE_FEATURES+i];
+                       float s = feature_matrix[i*num_features+i];
                        reduced_energy += s;
                }
        }
        else {
                for(int i = 0; i < max_rank; i++, (*rank)++) {
-                       float s = feature_matrix[i*DENOISE_FEATURES+i];
+                       float s = feature_matrix[i*num_features+i];
                        if(i >= 2 && sqrtf(s) < pca_threshold)
                                break;
                }
        }
 
-       math_matrix_transpose(transform, DENOISE_FEATURES, transform_stride);
+       math_matrix_transpose(transform, num_features, transform_stride);
 
        /* Bake the feature scaling into the transformation matrix. */
-       for(int i = 0; i < DENOISE_FEATURES; i++) {
+       for(int i = 0; i < num_features; i++) {
                for(int j = 0; j < (*rank); j++) {
-                       transform[(i*DENOISE_FEATURES + j)*transform_stride] *= feature_scale[i];
+                       transform[(i*num_features + j)*transform_stride] *= feature_scale[i];
                }
        }
 }
index 9e65f61664b6544de50abec160084e6fa049f020..10bd3e477e90d167c2d5c2457e5859800ee8ce00 100644 (file)
 CCL_NAMESPACE_BEGIN
 
 ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
+                                                  CCL_FILTER_TILE_INFO,
                                                   int x, int y, int4 rect,
-                                                  int pass_stride,
+                                                  int pass_stride, int frame_stride,
+                                                  bool use_time,
                                                   float *transform, int *rank,
                                                   int radius, float pca_threshold)
 {
@@ -26,55 +28,63 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
 
        float4 features[DENOISE_FEATURES];
        const float *ccl_restrict pixel_buffer;
-       int2 pixel;
+       int3 pixel;
 
+       int num_features = use_time? 11 : 10;
+
+       /* === Calculate denoising window. === */
        int2 low  = make_int2(max(rect.x, x - radius),
                              max(rect.y, y - radius));
        int2 high = make_int2(min(rect.z, x + radius + 1),
                              min(rect.w, y + radius + 1));
-       int num_pixels = (high.y - low.y) * (high.x - low.x);
+       int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
 
+       /* === Shift feature passes to have mean 0. === */
        float4 feature_means[DENOISE_FEATURES];
-       math_vector_zero_sse(feature_means, DENOISE_FEATURES);
+       math_vector_zero_sse(feature_means, num_features);
        FOR_PIXEL_WINDOW_SSE {
-               filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride);
-               math_vector_add_sse(feature_means, DENOISE_FEATURES, features);
+               filter_get_features_sse(x4, y4, t4, active_pixels, pixel_buffer, features, use_time, NULL, pass_stride);
+               math_vector_add_sse(feature_means, num_features, features);
        } END_FOR_PIXEL_WINDOW_SSE
 
        float4 pixel_scale = make_float4(1.0f / num_pixels);
-       for(int i = 0; i < DENOISE_FEATURES; i++) {
+       for(int i = 0; i < num_features; i++) {
                feature_means[i] = reduce_add(feature_means[i]) * pixel_scale;
        }
 
+       /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
        float4 feature_scale[DENOISE_FEATURES];
-       math_vector_zero_sse(feature_scale, DENOISE_FEATURES);
+       math_vector_zero_sse(feature_scale, num_features);
        FOR_PIXEL_WINDOW_SSE {
-               filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
-               math_vector_max_sse(feature_scale, features, DENOISE_FEATURES);
+               filter_get_feature_scales_sse(x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride);
+               math_vector_max_sse(feature_scale, features, num_features);
        } END_FOR_PIXEL_WINDOW_SSE
 
-       filter_calculate_scale_sse(feature_scale);
+       filter_calculate_scale_sse(feature_scale, use_time);
 
+       /* === Generate the feature transformation. ===
+        * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space
+        * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
        float4 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
-       math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES);
+       math_matrix_zero_sse(feature_matrix_sse, num_features);
        FOR_PIXEL_WINDOW_SSE {
-               filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
-               math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale);
-               math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, make_float4(1.0f));
+               filter_get_features_sse(x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride);
+               math_vector_mul_sse(features, num_features, feature_scale);
+               math_matrix_add_gramian_sse(feature_matrix_sse, num_features, features, make_float4(1.0f));
        } END_FOR_PIXEL_WINDOW_SSE
 
        float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
-       math_matrix_hsum(feature_matrix, DENOISE_FEATURES, feature_matrix_sse);
+       math_matrix_hsum(feature_matrix, num_features, feature_matrix_sse);
 
-       math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
+       math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1);
 
        *rank = 0;
        /* Prevent overfitting when a small window is used. */
-       int max_rank = min(DENOISE_FEATURES, num_pixels/3);
+       int max_rank = min(num_features, num_pixels/3);
        if(pca_threshold < 0.0f) {
                float threshold_energy = 0.0f;
-               for(int i = 0; i < DENOISE_FEATURES; i++) {
-                       threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+               for(int i = 0; i < num_features; i++) {
+                       threshold_energy += feature_matrix[i*num_features+i];
                }
                threshold_energy *= 1.0f - (-pca_threshold);
 
@@ -82,23 +92,23 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
                for(int i = 0; i < max_rank; i++, (*rank)++) {
                        if(i >= 2 && reduced_energy >= threshold_energy)
                                break;
-                       float s = feature_matrix[i*DENOISE_FEATURES+i];
+                       float s = feature_matrix[i*num_features+i];
                        reduced_energy += s;
                }
        }
        else {
                for(int i = 0; i < max_rank; i++, (*rank)++) {
-                       float s = feature_matrix[i*DENOISE_FEATURES+i];
+                       float s = feature_matrix[i*num_features+i];
                        if(i >= 2 && sqrtf(s) < pca_threshold)
                                break;
                }
        }
 
-       math_matrix_transpose(transform, DENOISE_FEATURES, 1);
+       math_matrix_transpose(transform, num_features, 1);
 
        /* Bake the feature scaling into the transformation matrix. */
-       for(int i = 0; i < DENOISE_FEATURES; i++) {
-               math_vector_scale(transform + i*DENOISE_FEATURES, feature_scale[i][0], *rank);
+       for(int i = 0; i < num_features; i++) {
+               math_vector_scale(transform + i*num_features, feature_scale[i][0], *rank);
        }
 }
 
index 08333c7a455c327c4e25b0054ec03065caaf63b8..02c85562db8db76e8855a0543d3baf27f1df0e35 100644 (file)
@@ -68,6 +68,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
                                                       int r);
 
 void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
+                                                           TileInfo *tiles,
                                                            int x,
                                                            int y,
                                                            int storage_ofs,
@@ -75,6 +76,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
                                                            int *rank,
                                                            int* rect,
                                                            int pass_stride,
+                                                           int frame_stride,
+                                                           bool use_time,
                                                            int radius,
                                                            float pca_threshold);
 
@@ -87,6 +90,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
                                                            int* rect,
                                                            int stride,
                                                            int channel_offset,
+                                                           int frame_offset,
                                                            float a,
                                                            float k_2);
 
@@ -116,6 +120,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
 
 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
                                                              int dy,
+                                                             int t,
                                                              float *difference_image,
                                                              float *buffer,
                                                              float *transform,
@@ -126,7 +131,9 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
                                                              int *filter_window,
                                                              int stride,
                                                              int f,
-                                                             int pass_stride);
+                                                             int pass_stride,
+                                                             int frame_offset,
+                                                             bool use_time);
 
 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image,
                                                      float *accum_image,
index b792367e3abd02ba30ba09c029bdac54c7e9ce4b..c29505880cbd68249af1b2f5b38026f1d52bddcb 100644 (file)
@@ -135,6 +135,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
 }
 
 void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
+                                                           TileInfo *tile_info,
                                                            int x,
                                                            int y,
                                                            int storage_ofs,
@@ -142,6 +143,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
                                                            int *rank,
                                                            int* prefilter_rect,
                                                            int pass_stride,
+                                                           int frame_stride,
+                                                           bool use_time,
                                                            int radius,
                                                            float pca_threshold)
 {
@@ -151,9 +154,12 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
        rank += storage_ofs;
        transform += storage_ofs*TRANSFORM_SIZE;
        kernel_filter_construct_transform(buffer,
+                                         tile_info,
                                          x, y,
                                          load_int4(prefilter_rect),
                                          pass_stride,
+                                         frame_stride,
+                                         use_time,
                                          transform,
                                          rank,
                                          radius,
@@ -170,6 +176,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
                                                            int *rect,
                                                            int stride,
                                                            int channel_offset,
+                                                           int frame_offset,
                                                            float a,
                                                            float k_2)
 {
@@ -184,6 +191,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
                                          load_int4(rect),
                                          stride,
                                          channel_offset,
+                                         frame_offset,
                                          a, k_2);
 #endif
 }
@@ -243,6 +251,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
 
 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
                                                              int dy,
+                                                             int t,
                                                              float *difference_image,
                                                              float *buffer,
                                                              float *transform,
@@ -253,12 +262,14 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
                                                              int *filter_window,
                                                              int stride,
                                                              int f,
-                                                             int pass_stride)
+                                                             int pass_stride,
+                                                             int frame_offset,
+                                                             bool use_time)
 {
 #ifdef KERNEL_STUB
        STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
 #else
-       kernel_filter_nlm_construct_gramian(dx, dy,
+       kernel_filter_nlm_construct_gramian(dx, dy, t,
                                            difference_image,
                                            buffer,
                                            transform, rank,
@@ -266,7 +277,9 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
                                            load_int4(rect),
                                            load_int4(filter_window),
                                            stride, f,
-                                           pass_stride);
+                                           pass_stride,
+                                           frame_offset,
+                                           use_time);
 #endif
 }
 
index 3b51bb41aed13e70603438ef377587b32638db69..5b552b014132a8f3cfe5f501601d5e37eb4dfda7 100644 (file)
@@ -29,7 +29,7 @@
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
 kernel_cuda_filter_divide_shadow(int sample,
-                                 TileInfo *tile_info,
+                                 CCL_FILTER_TILE_INFO,
                                  float *unfilteredA,
                                  float *unfilteredB,
                                  float *sampleVariance,
@@ -59,7 +59,7 @@ kernel_cuda_filter_divide_shadow(int sample,
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
 kernel_cuda_filter_get_feature(int sample,
-                               TileInfo *tile_info,
+                               CCL_FILTER_TILE_INFO,
                                int m_offset,
                                int v_offset,
                                float *mean,
@@ -138,10 +138,12 @@ kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
 kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
+                                       CCL_FILTER_TILE_INFO,
                                        float *transform, int *rank,
                                        int4 filter_area, int4 rect,
                                        int radius, float pca_threshold,
-                                       int pass_stride)
+                                       int pass_stride, int frame_stride,
+                                       bool use_time)
 {
        int x = blockDim.x*blockIdx.x + threadIdx.x;
        int y = blockDim.y*blockIdx.y + threadIdx.y;
@@ -149,8 +151,11 @@ kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
                int *l_rank = rank + y*filter_area.z + x;
                float *l_transform = transform + y*filter_area.z + x;
                kernel_filter_construct_transform(buffer,
+                                                 tile_info,
                                                  x + filter_area.x, y + filter_area.y,
-                                                 rect, pass_stride,
+                                                 rect,
+                                                 pass_stride, frame_stride,
+                                                 use_time,
                                                  l_transform, l_rank,
                                                  radius, pca_threshold,
                                                  filter_area.z*filter_area.w,
@@ -170,6 +175,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
                                        int pass_stride,
                                        int r,
                                        int channel_offset,
+                                       int frame_offset,
                                        float a,
                                        float k_2)
 {
@@ -183,6 +189,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
                                                  difference_image + ofs,
                                                  rect, stride,
                                                  channel_offset,
+                                                 frame_offset,
                                                  a, k_2);
        }
 }
@@ -274,7 +281,8 @@ kernel_cuda_filter_nlm_normalize(float *out_image,
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_image,
+kernel_cuda_filter_nlm_construct_gramian(int t,
+                                         const float *ccl_restrict difference_image,
                                          const float *ccl_restrict buffer,
                                          float const* __restrict__ transform,
                                          int *rank,
@@ -286,13 +294,16 @@ kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_im
                                          int stride,
                                          int pass_stride,
                                          int r,
-                                         int f)
+                                         int f,
+                                         int frame_offset,
+                                         bool use_time)
 {
        int4 co, rect;
        int ofs;
        if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
                kernel_filter_nlm_construct_gramian(co.x, co.y,
                                                    co.z, co.w,
+                                                   t,
                                                    difference_image + ofs,
                                                    buffer,
                                                    transform, rank,
@@ -300,6 +311,8 @@ kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_im
                                                    rect, filter_window,
                                                    stride, f,
                                                    pass_stride,
+                                                   frame_offset,
+                                                   use_time,
                                                    threadIdx.y*blockDim.x + threadIdx.x);
        }
 }
index 8a821ee281d08223091b4abd186874d2dcef5e2b..996bc27f71b249fe7be649c264fffd4c3c48ed11 100644 (file)
@@ -127,11 +127,14 @@ __kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean,
 }
 
 __kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
+                                                    CCL_FILTER_TILE_INFO,
                                                     ccl_global float *transform,
                                                     ccl_global int *rank,
                                                     int4 filter_area,
                                                     int4 rect,
                                                     int pass_stride,
+                                                    int frame_stride,
+                                                    char use_time,
                                                     int radius,
                                                     float pca_threshold)
 {
@@ -141,8 +144,11 @@ __kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_
                ccl_global int *l_rank = rank + y*filter_area.z + x;
                ccl_global float *l_transform = transform + y*filter_area.z + x;
                kernel_filter_construct_transform(buffer,
+                                                 CCL_FILTER_TILE_INFO_ARG,
                                                  x + filter_area.x, y + filter_area.y,
-                                                 rect, pass_stride,
+                                                 rect,
+                                                 pass_stride, frame_stride,
+                                                 use_time,
                                                  l_transform, l_rank,
                                                  radius, pca_threshold,
                                                  filter_area.z*filter_area.w,
@@ -160,6 +166,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_
                                                     int pass_stride,
                                                     int r,
                                                     int channel_offset,
+                                                    int frame_offset,
                                                     float a,
                                                     float k_2)
 {
@@ -173,6 +180,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_
                                                  difference_image + ofs,
                                                  rect, stride,
                                                  channel_offset,
+                                                 frame_offset,
                                                  a, k_2);
        }
 }
@@ -254,7 +262,8 @@ __kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *out_image,
        }
 }
 
-__kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *ccl_restrict difference_image,
+__kernel void kernel_ocl_filter_nlm_construct_gramian(int t,
+                                                      const ccl_global float *ccl_restrict difference_image,
                                                       const ccl_global float *ccl_restrict buffer,
                                                       const ccl_global float *ccl_restrict transform,
                                                       ccl_global int *rank,
@@ -266,13 +275,16 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *cc
                                                       int stride,
                                                       int pass_stride,
                                                       int r,
-                                                      int f)
+                                                      int f,
+                                                      int frame_offset,
+                                                      char use_time)
 {
        int4 co, rect;
        int ofs;
        if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
                kernel_filter_nlm_construct_gramian(co.x, co.y,
                                                    co.z, co.w,
+                                                   t,
                                                    difference_image + ofs,
                                                    buffer,
                                                    transform, rank,
@@ -280,6 +292,8 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *cc
                                                    rect, filter_window,
                                                    stride, f,
                                                    pass_stride,
+                                                   frame_offset,
+                                                   use_time,
                                                    get_local_id(1)*get_local_size(0) + get_local_id(0));
        }
 }