Cycles: animation denoising support in the kernel.
authorLukas Stockner <lukas.stockner@freenet.de>
Wed, 6 Feb 2019 13:19:20 +0000 (14:19 +0100)
committerBrecht Van Lommel <brechtvanlommel@gmail.com>
Wed, 6 Feb 2019 14:18:42 +0000 (15:18 +0100)
This is the internal implementation, not available from the API or
interface yet. The algorithm takes into account past and future frames,
both to get more coherent animation and reduce noise.

Ref D3889.

20 files changed:
intern/cycles/device/device_cpu.cpp
intern/cycles/device/device_cuda.cpp
intern/cycles/device/device_denoising.cpp
intern/cycles/device/device_denoising.h
intern/cycles/device/device_task.h
intern/cycles/device/opencl/opencl.h
intern/cycles/device/opencl/opencl_base.cpp
intern/cycles/kernel/filter/filter_defines.h
intern/cycles/kernel/filter/filter_features.h
intern/cycles/kernel/filter/filter_features_sse.h
intern/cycles/kernel/filter/filter_nlm_cpu.h
intern/cycles/kernel/filter/filter_nlm_gpu.h
intern/cycles/kernel/filter/filter_reconstruction.h
intern/cycles/kernel/filter/filter_transform.h
intern/cycles/kernel/filter/filter_transform_gpu.h
intern/cycles/kernel/filter/filter_transform_sse.h
intern/cycles/kernel/kernels/cpu/filter_cpu.h
intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
intern/cycles/kernel/kernels/cuda/filter.cu
intern/cycles/kernel/kernels/opencl/filter.cl

index 6668acc..93c63b9 100644 (file)
@@ -186,15 +186,15 @@ public:
        KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                               filter_detect_outliers_kernel;
        KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                               filter_combine_halves_kernel;
 
-       KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int, float, float)>      filter_nlm_calc_difference_kernel;
+       KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int, int, float, float)> filter_nlm_calc_difference_kernel;
        KernelFunctions<void(*)(float*, float*, int*, int, int)>                                              filter_nlm_blur_kernel;
        KernelFunctions<void(*)(float*, float*, int*, int, int)>                                              filter_nlm_calc_weight_kernel;
        KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, int, int, int)>       filter_nlm_update_output_kernel;
        KernelFunctions<void(*)(float*, float*, int*, int)>                                                   filter_nlm_normalize_kernel;
 
-       KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)>                         filter_construct_transform_kernel;
-       KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel;
-       KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)>                            filter_finalize_kernel;
+       KernelFunctions<void(*)(float*, TileInfo*, int, int, int, float*, int*, int*, int, int, bool, int, float)>                   filter_construct_transform_kernel;
+       KernelFunctions<void(*)(int, int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int, bool)> filter_nlm_construct_gramian_kernel;
+       KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)>                                            filter_finalize_kernel;
 
        KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*,
                               int, int, int, int, int, int, int, int, ccl_global int*, int,
@@ -512,7 +512,7 @@ public:
                                                            difference,
                                                            local_rect,
                                                            w, channel_offset,
-                                                           a, k_2);
+                                                           0, a, k_2);
 
                        filter_nlm_blur_kernel()       (difference, blurDifference, local_rect, w, f);
                        filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f);
@@ -542,6 +542,7 @@ public:
                for(int y = 0; y < task->filter_area.w; y++) {
                        for(int x = 0; x < task->filter_area.z; x++) {
                                filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer,
+                                                                   task->tile_info,
                                                                    x + task->filter_area.x,
                                                                    y + task->filter_area.y,
                                                                    y*task->filter_area.z + x,
@@ -549,6 +550,8 @@ public:
                                                                    (int*)   task->storage.rank.device_pointer,
                                                                    &task->rect.x,
                                                                    task->buffer.pass_stride,
+                                                                   task->buffer.frame_stride,
+                                                                   task->buffer.use_time,
                                                                    task->radius,
                                                                    task->pca_threshold);
                        }
@@ -559,6 +562,7 @@ public:
        bool denoising_accumulate(device_ptr color_ptr,
                                  device_ptr color_variance_ptr,
                                  device_ptr scale_ptr,
+                                 int frame,
                                  DenoisingTask *task)
        {
                ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT);
@@ -568,6 +572,7 @@ public:
                float *blurDifference = temporary_mem + task->buffer.pass_stride;
 
                int r = task->radius;
+               int frame_offset = frame * task->buffer.frame_stride;
                for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
                        int dy = i / (2*r+1) - r;
                        int dx = i % (2*r+1) - r;
@@ -583,12 +588,14 @@ public:
                                                            local_rect,
                                                            task->buffer.stride,
                                                            task->buffer.pass_stride,
+                                                           frame_offset,
                                                            1.0f,
                                                            task->nlm_k_2);
                        filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
                        filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.stride, 4);
                        filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
                        filter_nlm_construct_gramian_kernel()(dx, dy,
+                                                             task->tile_info->frames[frame],
                                                              blurDifference,
                                                              (float*)  task->buffer.mem.device_pointer,
                                                              (float*)  task->storage.transform.device_pointer,
@@ -599,7 +606,9 @@ public:
                                                              &task->reconstruction_state.filter_window.x,
                                                              task->buffer.stride,
                                                              4,
-                                                             task->buffer.pass_stride);
+                                                             task->buffer.pass_stride,
+                                                             frame_offset,
+                                                             task->buffer.use_time);
                }
 
                return true;
@@ -787,7 +796,7 @@ public:
                tile.sample = tile.start_sample + tile.num_samples;
 
                denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising);
-               denoising.functions.accumulate = function_bind(&CPUDevice::denoising_accumulate, this, _1, _2, _3, &denoising);
+               denoising.functions.accumulate = function_bind(&CPUDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
                denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising);
                denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
                denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
index cb7d8bb..e21d974 100644 (file)
@@ -1301,6 +1301,7 @@ public:
                int pass_stride = task->buffer.pass_stride;
                int num_shifts = (2*r+1)*(2*r+1);
                int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0;
+               int frame_offset = 0;
 
                if(have_error())
                        return false;
@@ -1327,7 +1328,7 @@ public:
 
                        CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w*h, num_shifts);
 
-                       void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &scale_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &a, &k_2};
+                       void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &scale_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &frame_offset, &a, &k_2};
                        void *blur_args[]            = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
                        void *calc_weight_args[]     = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
                        void *update_output_args[]   = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &pass_stride, &channel_offset, &r, &f};
@@ -1367,13 +1368,16 @@ public:
                                   task->storage.h);
 
                void *args[] = {&task->buffer.mem.device_pointer,
+                               &task->tile_info_mem.device_pointer,
                                &task->storage.transform.device_pointer,
                                &task->storage.rank.device_pointer,
                                &task->filter_area,
                                &task->rect,
                                &task->radius,
                                &task->pca_threshold,
-                               &task->buffer.pass_stride};
+                               &task->buffer.pass_stride,
+                               &task->buffer.frame_stride,
+                               &task->buffer.use_time};
                CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
                cuda_assert(cuCtxSynchronize());
 
@@ -1383,6 +1387,7 @@ public:
        bool denoising_accumulate(device_ptr color_ptr,
                                  device_ptr color_variance_ptr,
                                  device_ptr scale_ptr,
+                                 int frame,
                                  DenoisingTask *task)
        {
                if(have_error())
@@ -1398,6 +1403,8 @@ public:
                int w = task->reconstruction_state.source_w;
                int h = task->reconstruction_state.source_h;
                int stride = task->buffer.stride;
+               int frame_offset = frame * task->buffer.frame_stride;
+               int t = task->tile_info->frames[frame];
 
                int pass_stride = task->buffer.pass_stride;
                int num_shifts = (2*r+1)*(2*r+1);
@@ -1430,10 +1437,12 @@ public:
                                                &w, &h,
                                                &stride, &pass_stride,
                                                &r, &pass_stride,
+                                               &frame_offset,
                                                &a, &k_2};
                void *blur_args[]            = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
                void *calc_weight_args[]     = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
-               void *construct_gramian_args[] = {&blurDifference,
+               void *construct_gramian_args[] = {&t,
+                                                 &blurDifference,
                                                  &task->buffer.mem.device_pointer,
                                                  &task->storage.transform.device_pointer,
                                                  &task->storage.rank.device_pointer,
@@ -1442,7 +1451,9 @@ public:
                                                  &task->reconstruction_state.filter_window,
                                                  &w, &h, &stride,
                                                  &pass_stride, &r,
-                                                 &f};
+                                                 &f,
+                                                 &frame_offset,
+                                                 &task->buffer.use_time};
 
                CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
                CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
@@ -1635,7 +1646,7 @@ public:
        void denoise(RenderTile &rtile, DenoisingTask& denoising)
        {
                denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising);
-               denoising.functions.accumulate = function_bind(&CUDADevice::denoising_accumulate, this, _1, _2, _3, &denoising);
+               denoising.functions.accumulate = function_bind(&CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
                denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising);
                denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
                denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
index 724171c..61e0ba4 100644 (file)
@@ -36,6 +36,7 @@ DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task)
                pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising_feature_strength));
        }
 
+       render_buffer.frame_stride = task.frame_stride;
        render_buffer.pass_stride = task.pass_stride;
        render_buffer.offset = task.pass_denoising_data;
 
@@ -49,6 +50,12 @@ DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task)
        tile_info = (TileInfo*) tile_info_mem.alloc(sizeof(TileInfo)/sizeof(int));
        tile_info->from_render = task.denoising_from_render? 1 : 0;
 
+       tile_info->frames[0] = 0;
+       tile_info->num_frames = min(task.denoising_frames.size() + 1, DENOISE_MAX_FRAMES);
+       for(int i = 1; i < tile_info->num_frames; i++) {
+               tile_info->frames[i] = task.denoising_frames[i-1];
+       }
+
        write_passes = task.denoising_write_passes;
        do_filter = task.denoising_do_filter;
 }
@@ -101,16 +108,18 @@ void DenoisingTask::setup_denoising_buffer()
        rect = rect_expand(rect, radius);
        rect = rect_clip(rect, make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3]));
 
-       buffer.use_intensity = write_passes;
+       buffer.use_intensity = write_passes || (tile_info->num_frames > 1);
        buffer.passes = buffer.use_intensity? 15 : 14;
        buffer.width = rect.z - rect.x;
        buffer.stride = align_up(buffer.width, 4);
        buffer.h = rect.w - rect.y;
        int alignment_floats = divide_up(device->mem_sub_ptr_alignment(), sizeof(float));
        buffer.pass_stride = align_up(buffer.stride * buffer.h, alignment_floats);
+       buffer.frame_stride = buffer.pass_stride * buffer.passes;
        /* Pad the total size by four floats since the SIMD kernels might go a bit over the end. */
-       int mem_size = align_up(buffer.pass_stride * buffer.passes + 4, alignment_floats);
+       int mem_size = align_up(tile_info->num_frames * buffer.frame_stride + 4, alignment_floats);
        buffer.mem.alloc_to_device(mem_size, false);
+       buffer.use_time = (tile_info->num_frames > 1);
 
        /* CPUs process shifts sequentially while GPUs process them in parallel. */
        int num_layers;
@@ -216,6 +225,25 @@ void DenoisingTask::prefilter_color()
        }
 }
 
+void DenoisingTask::load_buffer()
+{
+       device_ptr null_ptr = (device_ptr) 0;
+
+       int original_offset = render_buffer.offset;
+
+       int num_passes = buffer.use_intensity? 15 : 14;
+       for(int i = 0; i < tile_info->num_frames; i++) {
+               for(int pass = 0; pass < num_passes; pass++) {
+                       device_sub_ptr to_pass(buffer.mem, i*buffer.frame_stride + pass*buffer.pass_stride, buffer.pass_stride);
+                       bool is_variance = (pass >= 11) && (pass <= 13);
+                       functions.get_feature(pass, -1, *to_pass, null_ptr, is_variance? (1.0f / render_buffer.samples) : 1.0f);
+               }
+               render_buffer.offset += render_buffer.frame_stride;
+       }
+
+       render_buffer.offset = original_offset;
+}
+
 void DenoisingTask::write_buffer()
 {
        reconstruction_state.buffer_params = make_int4(target_buffer.offset,
@@ -259,11 +287,17 @@ void DenoisingTask::reconstruct()
 
        device_sub_ptr color_ptr    (buffer.mem,  8*buffer.pass_stride, 3*buffer.pass_stride);
        device_sub_ptr color_var_ptr(buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride);
-
-       device_ptr scale_ptr = 0;
-       device_sub_ptr *scale_sub_ptr = NULL;
-       functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr);
-       delete scale_sub_ptr;
+       for(int f = 0; f < tile_info->num_frames; f++) {
+               device_ptr scale_ptr = 0;
+               device_sub_ptr *scale_sub_ptr = NULL;
+               if(tile_info->frames[f] != 0 && (tile_info->num_frames > 1)) {
+                       scale_sub_ptr = new device_sub_ptr(buffer.mem, 14*buffer.pass_stride, buffer.pass_stride);
+                       scale_ptr = **scale_sub_ptr;
+               }
+
+               functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr, f);
+               delete scale_sub_ptr;
+       }
        functions.solve(target_buffer.ptr);
 }
 
@@ -276,9 +310,14 @@ void DenoisingTask::run_denoising(RenderTile *tile)
 
        setup_denoising_buffer();
 
-       prefilter_shadowing();
-       prefilter_features();
-       prefilter_color();
+       if(tile_info->from_render) {
+               prefilter_shadowing();
+               prefilter_features();
+               prefilter_color();
+       }
+       else {
+               load_buffer();
+       }
 
        if(do_filter) {
                construct_transform();
index cddcd3b..5869aa0 100644 (file)
@@ -38,6 +38,7 @@ public:
        struct RenderBuffers {
                int offset;
                int pass_stride;
+               int frame_stride;
                int samples;
        } render_buffer;
 
@@ -70,7 +71,8 @@ public:
                              )> non_local_means;
                function<bool(device_ptr color_ptr,
                              device_ptr color_variance_ptr,
-                             device_ptr scale_ptr
+                             device_ptr scale_ptr,
+                             int frame
                              )> accumulate;
                function<bool(device_ptr output_ptr)> solve;
                function<bool()> construct_transform;
@@ -156,8 +158,10 @@ public:
                int stride;
                int h;
                int width;
+               int frame_stride;
                device_only_memory<float> mem;
                device_only_memory<float> temporary_mem;
+               bool use_time;
                bool use_intensity;
 
                bool gpu_temporary_mem;
@@ -179,6 +183,7 @@ protected:
        void construct_transform();
        void reconstruct();
 
+       void load_buffer();
        void write_buffer();
 };
 
index 97bcde9..2871bc5 100644 (file)
@@ -73,11 +73,13 @@ public:
        float denoising_feature_strength;
        bool denoising_relative_pca;
        bool denoising_from_render;
+       vector<int> denoising_frames;
 
        bool denoising_do_filter;
        bool denoising_write_passes;
 
        int pass_stride;
+       int frame_stride;
        int target_pass_stride;
        int pass_denoising_data;
        int pass_denoising_clean;
index 4d42ddc..9b76316 100644 (file)
@@ -422,6 +422,7 @@ protected:
        bool denoising_accumulate(device_ptr color_ptr,
                                  device_ptr color_variance_ptr,
                                  device_ptr scale_ptr,
+                                 int frame,
                                  DenoisingTask *task);
        bool denoising_solve(device_ptr output_ptr,
                             DenoisingTask *task);
index a0a1cf6..4417065 100644 (file)
@@ -821,16 +821,31 @@ bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
        cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
        cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
        cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
+       cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
+
+       char use_time = task->buffer.use_time? 1 : 0;
 
        cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
 
-       kernel_set_args(ckFilterConstructTransform, 0,
-                       buffer_mem,
+       int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0,
+                                     buffer_mem,
+                                     tile_info_mem);
+       cl_mem buffers[9];
+       for(int i = 0; i < 9; i++) {
+               buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
+               arg_ofs += kernel_set_args(ckFilterConstructTransform,
+                                          arg_ofs,
+                                          buffers[i]);
+       }
+       kernel_set_args(ckFilterConstructTransform,
+                       arg_ofs,
                        transform_mem,
                        rank_mem,
                        task->filter_area,
                        task->rect,
                        task->buffer.pass_stride,
+                       task->buffer.frame_stride,
+                       use_time,
                        task->radius,
                        task->pca_threshold);
 
@@ -845,6 +860,7 @@ bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
 bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr,
                                             device_ptr color_variance_ptr,
                                             device_ptr scale_ptr,
+                                            int frame,
                                             DenoisingTask *task)
 {
        cl_mem color_mem = CL_MEM_PTR(color_ptr);
@@ -865,6 +881,9 @@ bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr,
        int w = task->reconstruction_state.source_w;
        int h = task->reconstruction_state.source_h;
        int stride = task->buffer.stride;
+       int frame_offset = frame * task->buffer.frame_stride;
+       int t = task->tile_info->frames[frame];
+       char use_time = task->buffer.use_time? 1 : 0;
 
        int r = task->radius;
        int pass_stride = task->buffer.pass_stride;
@@ -884,6 +903,7 @@ bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr,
                        pass_stride,
                        r,
                        pass_stride,
+                       frame_offset,
                        1.0f, task->nlm_k_2);
        kernel_set_args(ckNLMBlur, 0,
                        difference_mem,
@@ -898,6 +918,7 @@ bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr,
                        pass_stride,
                        r, 4);
        kernel_set_args(ckNLMConstructGramian, 0,
+                       t,
                        blurDifference_mem,
                        buffer_mem,
                        transform_mem,
@@ -907,7 +928,9 @@ bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr,
                        task->reconstruction_state.filter_window,
                        w, h, stride,
                        pass_stride,
-                       r, 4);
+                       r, 4,
+                       frame_offset,
+                       use_time);
 
        enqueue_kernel(ckNLMCalcDifference,   w*h, num_shifts, true);
        enqueue_kernel(ckNLMBlur,             w*h, num_shifts, true);
@@ -1108,7 +1131,7 @@ bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr,
 void OpenCLDeviceBase::denoise(RenderTile &rtile, DenoisingTask& denoising)
 {
        denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising);
-       denoising.functions.accumulate = function_bind(&OpenCLDeviceBase::denoising_accumulate, this, _1, _2, _3, &denoising);
+       denoising.functions.accumulate = function_bind(&OpenCLDeviceBase::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
        denoising.functions.solve = function_bind(&OpenCLDeviceBase::denoising_solve, this, _1, &denoising);
        denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
        denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
index 9ac7c3d..cb04aac 100644 (file)
 #ifndef __FILTER_DEFINES_H__
 #define __FILTER_DEFINES_H__
 
-#define DENOISE_FEATURES 10
+#define DENOISE_FEATURES 11
 #define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES)
 #define XTWX_SIZE      (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2)
 #define XTWY_SIZE      (DENOISE_FEATURES+1)
 
+#define DENOISE_MAX_FRAMES 16
+
 typedef struct TileInfo {
        int offsets[9];
        int strides[9];
        int x[4];
        int y[4];
        int from_render;
+       int frames[DENOISE_MAX_FRAMES];
+       int num_frames;
        /* TODO(lukas): CUDA doesn't have uint64_t... */
 #ifdef __KERNEL_OPENCL__
        ccl_global float *buffers[9];
index 6226ed2..e1ea648 100644 (file)
 
 #define ccl_get_feature(buffer, pass) (buffer)[(pass)*pass_stride]
 
-/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).
- * pixel_buffer always points to the current pixel in the first pass. */
-#define FOR_PIXEL_WINDOW     pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
-                             for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
-                                 for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) {
+/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).+ * pixel_buffer always points to the current pixel in the first pass.
+ * Repeat the loop for every secondary frame if there are any. */
+#define FOR_PIXEL_WINDOW     for(int frame = 0; frame < tile_info->num_frames; frame++) { \
+                                 pixel.z = tile_info->frames[frame]; \
+                                 pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x) + frame*frame_stride; \
+                                 for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
+                                     for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) {
 
-#define END_FOR_PIXEL_WINDOW     } \
-                                 pixel_buffer += buffer_w - (high.x - low.x); \
+#define END_FOR_PIXEL_WINDOW         } \
+                                     pixel_buffer += buffer_w - (high.x - low.x); \
+                                 } \
                              }
 
-ccl_device_inline void filter_get_features(int2 pixel,
+ccl_device_inline void filter_get_features(int3 pixel,
                                            const ccl_global float *ccl_restrict buffer,
                                            float *features,
+                                           bool use_time,
                                            const float *ccl_restrict mean,
                                            int pass_stride)
 {
@@ -44,15 +48,20 @@ ccl_device_inline void filter_get_features(int2 pixel,
        features[7] = ccl_get_feature(buffer, 5);
        features[8] = ccl_get_feature(buffer, 6);
        features[9] = ccl_get_feature(buffer, 7);
+       if(use_time) {
+               features[10] = pixel.z;
+       }
        if(mean) {
-               for(int i = 0; i < DENOISE_FEATURES; i++)
+               for(int i = 0; i < (use_time? 11 : 10); i++) {
                        features[i] -= mean[i];
+               }
        }
 }
 
-ccl_device_inline void filter_get_feature_scales(int2 pixel,
+ccl_device_inline void filter_get_feature_scales(int3 pixel,
                                                  const ccl_global float *ccl_restrict buffer,
                                                  float *scales,
+                                                 bool use_time,
                                                  const float *ccl_restrict mean,
                                                  int pass_stride)
 {
@@ -66,13 +75,19 @@ ccl_device_inline void filter_get_feature_scales(int2 pixel,
        scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7],
                                            ccl_get_feature(buffer, 6) - mean[8],
                                            ccl_get_feature(buffer, 7) - mean[9]));
+       if(use_time) {
+               scales[6] = fabsf(pixel.z - mean[10]);
+       }
 }
 
-ccl_device_inline void filter_calculate_scale(float *scale)
+ccl_device_inline void filter_calculate_scale(float *scale, bool use_time)
 {
        scale[0] = 1.0f/max(scale[0], 0.01f);
        scale[1] = 1.0f/max(scale[1], 0.01f);
        scale[2] = 1.0f/max(scale[2], 0.01f);
+       if(use_time) {
+               scale[10] = 1.0f/max(scale[6], 0.01f);
+       }
        scale[6] = 1.0f/max(scale[4], 0.01f);
        scale[7] = scale[8] = scale[9] = 1.0f/max(sqrtf(scale[5]), 0.01f);
        scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f);
@@ -89,36 +104,46 @@ ccl_device_inline void design_row_add(float *design_row,
                                       const ccl_global float *ccl_restrict transform,
                                       int stride,
                                       int row,
-                                      float feature)
+                                      float feature,
+                                      int transform_row_stride)
 {
        for(int i = 0; i < rank; i++) {
-               design_row[1+i] += transform[(row*DENOISE_FEATURES + i)*stride]*feature;
+               design_row[1+i] += transform[(row*transform_row_stride + i)*stride]*feature;
        }
 }
 
 /* Fill the design row. */
-ccl_device_inline void filter_get_design_row_transform(int2 p_pixel,
+ccl_device_inline void filter_get_design_row_transform(int3 p_pixel,
                                                        const ccl_global float *ccl_restrict p_buffer,
-                                                       int2 q_pixel,
+                                                       int3 q_pixel,
                                                        const ccl_global float *ccl_restrict q_buffer,
                                                        int pass_stride,
                                                        int rank,
                                                        float *design_row,
                                                        const ccl_global float *ccl_restrict transform,
-                                                       int stride)
+                                                       int stride,
+                                                       bool use_time)
 {
+       int num_features = use_time? 11 : 10;
+
        design_row[0] = 1.0f;
        math_vector_zero(design_row+1, rank);
-       design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x);
-       design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y);
-       design_row_add(design_row, rank, transform, stride, 2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0)));
-       design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1));
-       design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2));
-       design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3));
-       design_row_add(design_row, rank, transform, stride, 6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4));
-       design_row_add(design_row, rank, transform, stride, 7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5));
-       design_row_add(design_row, rank, transform, stride, 8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6));
-       design_row_add(design_row, rank, transform, stride, 9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7));
+
+#define DESIGN_ROW_ADD(I, F) design_row_add(design_row, rank, transform, stride, I, F, num_features);
+       DESIGN_ROW_ADD(0, q_pixel.x - p_pixel.x);
+       DESIGN_ROW_ADD(1, q_pixel.y - p_pixel.y);
+       DESIGN_ROW_ADD(2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0)));
+       DESIGN_ROW_ADD(3,       ccl_get_feature(q_buffer, 1)  -       ccl_get_feature(p_buffer, 1));
+       DESIGN_ROW_ADD(4,       ccl_get_feature(q_buffer, 2)  -       ccl_get_feature(p_buffer, 2));
+       DESIGN_ROW_ADD(5,       ccl_get_feature(q_buffer, 3)  -       ccl_get_feature(p_buffer, 3));
+       DESIGN_ROW_ADD(6,       ccl_get_feature(q_buffer, 4)  -       ccl_get_feature(p_buffer, 4));
+       DESIGN_ROW_ADD(7,       ccl_get_feature(q_buffer, 5)  -       ccl_get_feature(p_buffer, 5));
+       DESIGN_ROW_ADD(8,       ccl_get_feature(q_buffer, 6)  -       ccl_get_feature(p_buffer, 6));
+       DESIGN_ROW_ADD(9,       ccl_get_feature(q_buffer, 7)  -       ccl_get_feature(p_buffer, 7));
+       if(use_time) {
+               DESIGN_ROW_ADD(10, q_pixel.z - p_pixel.z)
+       }
+#undef DESIGN_ROW_ADD
 }
 
 CCL_NAMESPACE_END
index 3ddd871..5dd001f 100644 (file)
@@ -20,26 +20,33 @@ CCL_NAMESPACE_BEGIN
 
 /* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time.
  * pixel_buffer always points to the first of the 4 current pixel in the first pass.
- * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. */
+ * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window.
+ * Repeat the loop for every secondary frame if there are any. */
+#define FOR_PIXEL_WINDOW_SSE     for(int frame = 0; frame < tile_info->num_frames; frame++) { \
+                                     pixel.z = tile_info->frames[frame]; \
+                                     pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x) + frame*frame_stride; \
+                                     float4 t4 = make_float4(pixel.z); \
+                                     for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
+                                         float4 y4 = make_float4(pixel.y); \
+                                         for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
+                                             float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \
+                                             int4 active_pixels = x4 < make_float4(high.x);
 
-#define FOR_PIXEL_WINDOW_SSE     pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
-                                 for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
-                                     float4 y4 = make_float4(pixel.y); \
-                                     for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
-                                         float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \
-                                         int4 active_pixels = x4 < make_float4(high.x);
-
-#define END_FOR_PIXEL_WINDOW_SSE     } \
-                                     pixel_buffer += buffer_w - (pixel.x - low.x); \
+#define END_FOR_PIXEL_WINDOW_SSE         } \
+                                         pixel_buffer += buffer_w - (high.x - low.x); \
+                                     } \
                                  }
 
-ccl_device_inline void filter_get_features_sse(float4 x, float4 y,
+ccl_device_inline void filter_get_features_sse(float4 x, float4 y, float4 t,
                                                int4 active_pixels,
                                                const float *ccl_restrict buffer,
                                                float4 *features,
+                                               bool use_time,
                                                const float4 *ccl_restrict mean,
                                                int pass_stride)
 {
+       int num_features = use_time? 11 : 10;
+
        features[0] = x;
        features[1] = y;
        features[2] = fabs(ccl_get_feature_sse(0));
@@ -50,18 +57,25 @@ ccl_device_inline void filter_get_features_sse(float4 x, float4 y,
        features[7] = ccl_get_feature_sse(5);
        features[8] = ccl_get_feature_sse(6);
        features[9] = ccl_get_feature_sse(7);
+       if(use_time) {
+               features[10] = t;
+       }
+
        if(mean) {
-               for(int i = 0; i < DENOISE_FEATURES; i++)
+               for(int i = 0; i < num_features; i++) {
                        features[i] = features[i] - mean[i];
+               }
        }
-       for(int i = 0; i < DENOISE_FEATURES; i++)
+       for(int i = 0; i < num_features; i++) {
                features[i] = mask(active_pixels, features[i]);
+       }
 }
 
-ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y,
+ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y, float4 t,
                                                      int4 active_pixels,
                                                      const float *ccl_restrict buffer,
                                                      float4 *scales,
+                                                     bool use_time,
                                                      const float4 *ccl_restrict mean,
                                                      int pass_stride)
 {
@@ -75,15 +89,22 @@ ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y,
        scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) +
                    sqr(ccl_get_feature_sse(6) - mean[8]) +
                    sqr(ccl_get_feature_sse(7) - mean[9]);
-       for(int i = 0; i < 6; i++)
+       if(use_time) {
+               scales[6] = fabs(t - mean[10]);
+       }
+
+       for(int i = 0; i < (use_time? 7 : 6); i++)
                scales[i] = mask(active_pixels, scales[i]);
 }
 
-ccl_device_inline void filter_calculate_scale_sse(float4 *scale)
+ccl_device_inline void filter_calculate_scale_sse(float4 *scale, bool use_time)
 {
        scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f)));
        scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f)));
        scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f)));
+       if(use_time) {
+               scale[10] = rcp(max(reduce_max(scale[6]), make_float4(0.01f)));;
+       }
        scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f)));
        scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f)));
        scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f)));
index 0c4387a..9eb3c60 100644 (file)
@@ -27,6 +27,7 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
                                                          int4 rect,
                                                          int stride,
                                                          int channel_offset,
+                                                         int frame_offset,
                                                          float a,
                                                          float k_2)
 {
@@ -39,7 +40,7 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
 
        for(int y = rect.y; y < rect.w; y++) {
                int idx_p = y*stride + aligned_lowx;
-               int idx_q = (y+dy)*stride + aligned_lowx + dx;
+               int idx_q = (y+dy)*stride + aligned_lowx + dx + frame_offset;
                for(int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) {
                        float4 diff = make_float4(0.0f);
                        float4 scale_fac;
@@ -181,7 +182,7 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
        }
 }
 
-ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
+ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, int t,
                                                            const float *ccl_restrict difference_image,
                                                            const float *ccl_restrict buffer,
                                                            float *transform,
@@ -191,7 +192,9 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
                                                            int4 rect,
                                                            int4 filter_window,
                                                            int stride, int f,
-                                                           int pass_stride)
+                                                           int pass_stride,
+                                                           int frame_offset,
+                                                           bool use_time)
 {
        int4 clip_area = rect_clip(rect, filter_window);
        /* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */
@@ -212,9 +215,11 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
                        int    *l_rank = rank + storage_ofs;
 
                        kernel_filter_construct_gramian(x, y, 1,
-                                                       dx, dy,
+                                                       dx, dy, t,
                                                        stride,
                                                        pass_stride,
+                                                       frame_offset,
+                                                       use_time,
                                                        buffer,
                                                        l_transform, l_rank,
                                                        weight, l_XtWX, l_XtWY, 0);
index d8e2e4d..1263639 100644 (file)
@@ -82,9 +82,10 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
                                                          ccl_global float *difference_image,
                                                          int4 rect, int stride,
                                                          int channel_offset,
+                                                         int frame_offset,
                                                          float a, float k_2)
 {
-       int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx);
+       int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx) + frame_offset;
        int numChannels = channel_offset? 3 : 1;
 
        float diff = 0.0f;
@@ -170,7 +171,7 @@ ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
 }
 
 ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y,
-                                                           int dx, int dy,
+                                                           int dx, int dy, int t,
                                                            const ccl_global float *ccl_restrict difference_image,
                                                            const ccl_global float *ccl_restrict buffer,
                                                            const ccl_global float *ccl_restrict transform,
@@ -181,6 +182,8 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y,
                                                            int4 filter_window,
                                                            int stride, int f,
                                                            int pass_stride,
+                                                           int frame_offset,
+                                                           bool use_time,
                                                            int localIdx)
 {
        const int low = max(rect.x, x-f);
@@ -201,9 +204,11 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y,
 
        kernel_filter_construct_gramian(x, y,
                                        rect_size(filter_window),
-                                       dx, dy,
+                                       dx, dy, t,
                                        stride,
                                        pass_stride,
+                                       frame_offset,
+                                       use_time,
                                        buffer,
                                        transform, rank,
                                        weight, XtWX, XtWY,
index e5d3b0d..31a7487 100644 (file)
@@ -18,9 +18,11 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
                                                        int storage_stride,
-                                                       int dx, int dy,
+                                                       int dx, int dy, int t,
                                                        int buffer_stride,
                                                        int pass_stride,
+                                                       int frame_offset,
+                                                       bool use_time,
                                                        const ccl_global float *ccl_restrict buffer,
                                                        const ccl_global float *ccl_restrict transform,
                                                        ccl_global int *rank,
@@ -34,7 +36,7 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
        }
 
        int p_offset =  y     * buffer_stride +  x;
-       int q_offset = (y+dy) * buffer_stride + (x+dx);
+       int q_offset = (y+dy) * buffer_stride + (x+dx) + frame_offset;
 
 #ifdef __KERNEL_GPU__
        const int stride = storage_stride;
@@ -57,9 +59,9 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
                return;
        }
 
-       filter_get_design_row_transform(make_int2(x, y),       buffer + p_offset,
-                                       make_int2(x+dx, y+dy), buffer + q_offset,
-                                       pass_stride, *rank, design_row, transform, stride);
+       filter_get_design_row_transform(make_int3(x, y, t),       buffer + p_offset,
+                                       make_int3(x+dx, y+dy, t), buffer + q_offset,
+                                       pass_stride, *rank, design_row, transform, stride, use_time);
 
 #ifdef __KERNEL_GPU__
        math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride);
index a5f87c0..94e27bb 100644 (file)
 CCL_NAMESPACE_BEGIN
 
 ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
+                                                  CCL_FILTER_TILE_INFO,
                                                   int x, int y, int4 rect,
-                                                  int pass_stride,
+                                                  int pass_stride, int frame_stride,
+                                                  bool use_time,
                                                   float *transform, int *rank,
                                                   int radius, float pca_threshold)
 {
@@ -26,59 +28,58 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
 
        float features[DENOISE_FEATURES];
 
-       /* Temporary storage, used in different steps of the algorithm. */
-       float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES];
-       float tempvector[2*DENOISE_FEATURES];
        const float *ccl_restrict pixel_buffer;
-       int2 pixel;
+       int3 pixel;
+
+       int num_features = use_time? 11 : 10;
 
        /* === Calculate denoising window. === */
        int2 low  = make_int2(max(rect.x, x - radius),
                              max(rect.y, y - radius));
        int2 high = make_int2(min(rect.z, x + radius + 1),
                              min(rect.w, y + radius + 1));
-       int num_pixels = (high.y - low.y) * (high.x - low.x);
+       int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
 
        /* === Shift feature passes to have mean 0. === */
        float feature_means[DENOISE_FEATURES];
-       math_vector_zero(feature_means, DENOISE_FEATURES);
+       math_vector_zero(feature_means, num_features);
        FOR_PIXEL_WINDOW {
-               filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
-               math_vector_add(feature_means, features, DENOISE_FEATURES);
+               filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride);
+               math_vector_add(feature_means, features, num_features);
        } END_FOR_PIXEL_WINDOW
 
-       math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES);
+       math_vector_scale(feature_means, 1.0f / num_pixels, num_features);
 
        /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
-       float *feature_scale = tempvector;
-       math_vector_zero(feature_scale, DENOISE_FEATURES);
+       float feature_scale[DENOISE_FEATURES];
+       math_vector_zero(feature_scale, num_features);
 
        FOR_PIXEL_WINDOW {
-               filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
-               math_vector_max(feature_scale, features, DENOISE_FEATURES);
+               filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
+               math_vector_max(feature_scale, features, num_features);
        } END_FOR_PIXEL_WINDOW
 
-       filter_calculate_scale(feature_scale);
+       filter_calculate_scale(feature_scale, use_time);
 
        /* === Generate the feature transformation. ===
-        * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space
+        * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space
         * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
-       float* feature_matrix = tempmatrix;
-       math_matrix_zero(feature_matrix, DENOISE_FEATURES);
+       float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
+       math_matrix_zero(feature_matrix, num_features);
        FOR_PIXEL_WINDOW {
-               filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
-               math_vector_mul(features, feature_scale, DENOISE_FEATURES);
-               math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
+               filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
+               math_vector_mul(features, feature_scale, num_features);
+               math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f);
        } END_FOR_PIXEL_WINDOW
 
-       math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
+       math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1);
        *rank = 0;
        /* Prevent overfitting when a small window is used. */
-       int max_rank = min(DENOISE_FEATURES, num_pixels/3);
+       int max_rank = min(num_features, num_pixels/3);
        if(pca_threshold < 0.0f) {
                float threshold_energy = 0.0f;
-               for(int i = 0; i < DENOISE_FEATURES; i++) {
-                       threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+               for(int i = 0; i < num_features; i++) {
+                       threshold_energy += feature_matrix[i*num_features+i];
                }
                threshold_energy *= 1.0f - (-pca_threshold);
 
@@ -86,13 +87,13 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
                for(int i = 0; i < max_rank; i++, (*rank)++) {
                        if(i >= 2 && reduced_energy >= threshold_energy)
                                break;
-                       float s = feature_matrix[i*DENOISE_FEATURES+i];
+                       float s = feature_matrix[i*num_features+i];
                        reduced_energy += s;
                }
        }
        else {
                for(int i = 0; i < max_rank; i++, (*rank)++) {
-                       float s = feature_matrix[i*DENOISE_FEATURES+i];
+                       float s = feature_matrix[i*num_features+i];
                        if(i >= 2 && sqrtf(s) < pca_threshold)
                                break;
                }
@@ -100,9 +101,9 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
 
        /* Bake the feature scaling into the transformation matrix. */
        for(int i = 0; i < (*rank); i++) {
-               math_vector_mul(transform + i*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES);
+               math_vector_mul(transform + i*num_features, feature_scale, num_features);
        }
-       math_matrix_transpose(transform, DENOISE_FEATURES, 1);
+       math_matrix_transpose(transform, num_features, 1);
 }
 
 CCL_NAMESPACE_END
index 83a1222..ed8ddcb 100644 (file)
 CCL_NAMESPACE_BEGIN
 
 ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
+                                                  CCL_FILTER_TILE_INFO,
                                                   int x, int y, int4 rect,
-                                                  int pass_stride,
+                                                  int pass_stride, int frame_stride,
+                                                  bool use_time,
                                                   ccl_global float *transform,
                                                   ccl_global int *rank,
                                                   int radius, float pca_threshold,
@@ -33,60 +35,62 @@ ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_re
        float features[DENOISE_FEATURES];
 #endif
 
+       int num_features = use_time? 11 : 10;
+
        /* === Calculate denoising window. === */
        int2 low  = make_int2(max(rect.x, x - radius),
                              max(rect.y, y - radius));
        int2 high = make_int2(min(rect.z, x + radius + 1),
                              min(rect.w, y + radius + 1));
-       int num_pixels = (high.y - low.y) * (high.x - low.x);
+       int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
        const ccl_global float *ccl_restrict pixel_buffer;
-       int2 pixel;
+       int3 pixel;
 
 
 
 
        /* === Shift feature passes to have mean 0. === */
        float feature_means[DENOISE_FEATURES];
-       math_vector_zero(feature_means, DENOISE_FEATURES);
+       math_vector_zero(feature_means, num_features);
        FOR_PIXEL_WINDOW {
-               filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
-               math_vector_add(feature_means, features, DENOISE_FEATURES);
+               filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride);
+               math_vector_add(feature_means, features, num_features);
        } END_FOR_PIXEL_WINDOW
 
-       math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES);
+       math_vector_scale(feature_means, 1.0f / num_pixels, num_features);
 
        /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
        float feature_scale[DENOISE_FEATURES];
-       math_vector_zero(feature_scale, DENOISE_FEATURES);
+       math_vector_zero(feature_scale, num_features);
 
        FOR_PIXEL_WINDOW {
-               filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
-               math_vector_max(feature_scale, features, DENOISE_FEATURES);
+               filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
+               math_vector_max(feature_scale, features, num_features);
        } END_FOR_PIXEL_WINDOW
 
-       filter_calculate_scale(feature_scale);
+       filter_calculate_scale(feature_scale, use_time);
 
 
 
        /* === Generate the feature transformation. ===
-        * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space
+        * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space
         * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
        float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
-       math_matrix_zero(feature_matrix, DENOISE_FEATURES);
+       math_matrix_zero(feature_matrix, num_features);
        FOR_PIXEL_WINDOW {
-               filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
-               math_vector_mul(features, feature_scale, DENOISE_FEATURES);
-               math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
+               filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
+               math_vector_mul(features, feature_scale, num_features);
+               math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f);
        } END_FOR_PIXEL_WINDOW
 
-       math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride);
+       math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, transform_stride);
        *rank = 0;
        /* Prevent overfitting when a small window is used. */
-       int max_rank = min(DENOISE_FEATURES, num_pixels/3);
+       int max_rank = min(num_features, num_pixels/3);
        if(pca_threshold < 0.0f) {
                float threshold_energy = 0.0f;
-               for(int i = 0; i < DENOISE_FEATURES; i++) {
-                       threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+               for(int i = 0; i < num_features; i++) {
+                       threshold_energy += feature_matrix[i*num_features+i];
                }
                threshold_energy *= 1.0f - (-pca_threshold);
 
@@ -94,24 +98,24 @@ ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_re
                for(int i = 0; i < max_rank; i++, (*rank)++) {
                        if(i >= 2 && reduced_energy >= threshold_energy)
                                break;
-                       float s = feature_matrix[i*DENOISE_FEATURES+i];
+                       float s = feature_matrix[i*num_features+i];
                        reduced_energy += s;
                }
        }
        else {
                for(int i = 0; i < max_rank; i++, (*rank)++) {
-                       float s = feature_matrix[i*DENOISE_FEATURES+i];
+                       float s = feature_matrix[i*num_features+i];
                        if(i >= 2 && sqrtf(s) < pca_threshold)
                                break;
                }
        }
 
-       math_matrix_transpose(transform, DENOISE_FEATURES, transform_stride);
+       math_matrix_transpose(transform, num_features, transform_stride);
 
        /* Bake the feature scaling into the transformation matrix. */
-       for(int i = 0; i < DENOISE_FEATURES; i++) {
+       for(int i = 0; i < num_features; i++) {
                for(int j = 0; j < (*rank); j++) {
-                       transform[(i*DENOISE_FEATURES + j)*transform_stride] *= feature_scale[i];
+                       transform[(i*num_features + j)*transform_stride] *= feature_scale[i];
                }
        }
 }
index 9e65f61..10bd3e4 100644 (file)
 CCL_NAMESPACE_BEGIN
 
 ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
+                                                  CCL_FILTER_TILE_INFO,
                                                   int x, int y, int4 rect,
-                                                  int pass_stride,
+                                                  int pass_stride, int frame_stride,
+                                                  bool use_time,
                                                   float *transform, int *rank,
                                                   int radius, float pca_threshold)
 {
@@ -26,55 +28,63 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
 
        float4 features[DENOISE_FEATURES];
        const float *ccl_restrict pixel_buffer;
-       int2 pixel;
+       int3 pixel;
 
+       int num_features = use_time? 11 : 10;
+
+       /* === Calculate denoising window. === */
        int2 low  = make_int2(max(rect.x, x - radius),
                              max(rect.y, y - radius));
        int2 high = make_int2(min(rect.z, x + radius + 1),
                              min(rect.w, y + radius + 1));
-       int num_pixels = (high.y - low.y) * (high.x - low.x);
+       int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
 
+       /* === Shift feature passes to have mean 0. === */
        float4 feature_means[DENOISE_FEATURES];
-       math_vector_zero_sse(feature_means, DENOISE_FEATURES);
+       math_vector_zero_sse(feature_means, num_features);
        FOR_PIXEL_WINDOW_SSE {
-               filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride);
-               math_vector_add_sse(feature_means, DENOISE_FEATURES, features);
+               filter_get_features_sse(x4, y4, t4, active_pixels, pixel_buffer, features, use_time, NULL, pass_stride);
+               math_vector_add_sse(feature_means, num_features, features);
        } END_FOR_PIXEL_WINDOW_SSE
 
        float4 pixel_scale = make_float4(1.0f / num_pixels);
-       for(int i = 0; i < DENOISE_FEATURES; i++) {
+       for(int i = 0; i < num_features; i++) {
                feature_means[i] = reduce_add(feature_means[i]) * pixel_scale;
        }
 
+       /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
        float4 feature_scale[DENOISE_FEATURES];
-       math_vector_zero_sse(feature_scale, DENOISE_FEATURES);
+       math_vector_zero_sse(feature_scale, num_features);
        FOR_PIXEL_WINDOW_SSE {
-               filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
-               math_vector_max_sse(feature_scale, features, DENOISE_FEATURES);
+               filter_get_feature_scales_sse(x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride);
+               math_vector_max_sse(feature_scale, features, num_features);
        } END_FOR_PIXEL_WINDOW_SSE
 
-       filter_calculate_scale_sse(feature_scale);
+       filter_calculate_scale_sse(feature_scale, use_time);
 
+       /* === Generate the feature transformation. ===
+        * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space
+        * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
        float4 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
-       math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES);
+       math_matrix_zero_sse(feature_matrix_sse, num_features);
        FOR_PIXEL_WINDOW_SSE {
-               filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
-               math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale);
-               math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, make_float4(1.0f));
+               filter_get_features_sse(x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride);
+               math_vector_mul_sse(features, num_features, feature_scale);
+               math_matrix_add_gramian_sse(feature_matrix_sse, num_features, features, make_float4(1.0f));
        } END_FOR_PIXEL_WINDOW_SSE
 
        float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
-       math_matrix_hsum(feature_matrix, DENOISE_FEATURES, feature_matrix_sse);
+       math_matrix_hsum(feature_matrix, num_features, feature_matrix_sse);
 
-       math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
+       math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1);
 
        *rank = 0;
        /* Prevent overfitting when a small window is used. */
-       int max_rank = min(DENOISE_FEATURES, num_pixels/3);
+       int max_rank = min(num_features, num_pixels/3);
        if(pca_threshold < 0.0f) {
                float threshold_energy = 0.0f;
-               for(int i = 0; i < DENOISE_FEATURES; i++) {
-                       threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+               for(int i = 0; i < num_features; i++) {
+                       threshold_energy += feature_matrix[i*num_features+i];
                }
                threshold_energy *= 1.0f - (-pca_threshold);
 
@@ -82,23 +92,23 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
                for(int i = 0; i < max_rank; i++, (*rank)++) {
                        if(i >= 2 && reduced_energy >= threshold_energy)
                                break;
-                       float s = feature_matrix[i*DENOISE_FEATURES+i];
+                       float s = feature_matrix[i*num_features+i];
                        reduced_energy += s;
                }
        }
        else {
                for(int i = 0; i < max_rank; i++, (*rank)++) {
-                       float s = feature_matrix[i*DENOISE_FEATURES+i];
+                       float s = feature_matrix[i*num_features+i];
                        if(i >= 2 && sqrtf(s) < pca_threshold)
                                break;
                }
        }
 
-       math_matrix_transpose(transform, DENOISE_FEATURES, 1);
+       math_matrix_transpose(transform, num_features, 1);
 
        /* Bake the feature scaling into the transformation matrix. */
-       for(int i = 0; i < DENOISE_FEATURES; i++) {
-               math_vector_scale(transform + i*DENOISE_FEATURES, feature_scale[i][0], *rank);
+       for(int i = 0; i < num_features; i++) {
+               math_vector_scale(transform + i*num_features, feature_scale[i][0], *rank);
        }
 }
 
index 08333c7..02c8556 100644 (file)
@@ -68,6 +68,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
                                                       int r);
 
 void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
+                                                           TileInfo *tiles,
                                                            int x,
                                                            int y,
                                                            int storage_ofs,
@@ -75,6 +76,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
                                                            int *rank,
                                                            int* rect,
                                                            int pass_stride,
+                                                           int frame_stride,
+                                                           bool use_time,
                                                            int radius,
                                                            float pca_threshold);
 
@@ -87,6 +90,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
                                                            int* rect,
                                                            int stride,
                                                            int channel_offset,
+                                                           int frame_offset,
                                                            float a,
                                                            float k_2);
 
@@ -116,6 +120,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
 
 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
                                                              int dy,
+                                                             int t,
                                                              float *difference_image,
                                                              float *buffer,
                                                              float *transform,
@@ -126,7 +131,9 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
                                                              int *filter_window,
                                                              int stride,
                                                              int f,
-                                                             int pass_stride);
+                                                             int pass_stride,
+                                                             int frame_offset,
+                                                             bool use_time);
 
 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image,
                                                      float *accum_image,
index b792367..c295058 100644 (file)
@@ -135,6 +135,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
 }
 
 void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
+                                                           TileInfo *tile_info,
                                                            int x,
                                                            int y,
                                                            int storage_ofs,
@@ -142,6 +143,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
                                                            int *rank,
                                                            int* prefilter_rect,
                                                            int pass_stride,
+                                                           int frame_stride,
+                                                           bool use_time,
                                                            int radius,
                                                            float pca_threshold)
 {
@@ -151,9 +154,12 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
        rank += storage_ofs;
        transform += storage_ofs*TRANSFORM_SIZE;
        kernel_filter_construct_transform(buffer,
+                                         tile_info,
                                          x, y,
                                          load_int4(prefilter_rect),
                                          pass_stride,
+                                         frame_stride,
+                                         use_time,
                                          transform,
                                          rank,
                                          radius,
@@ -170,6 +176,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
                                                            int *rect,
                                                            int stride,
                                                            int channel_offset,
+                                                           int frame_offset,
                                                            float a,
                                                            float k_2)
 {
@@ -184,6 +191,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
                                          load_int4(rect),
                                          stride,
                                          channel_offset,
+                                         frame_offset,
                                          a, k_2);
 #endif
 }
@@ -243,6 +251,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
 
 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
                                                              int dy,
+                                                             int t,
                                                              float *difference_image,
                                                              float *buffer,
                                                              float *transform,
@@ -253,12 +262,14 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
                                                              int *filter_window,
                                                              int stride,
                                                              int f,
-                                                             int pass_stride)
+                                                             int pass_stride,
+                                                             int frame_offset,
+                                                             bool use_time)
 {
 #ifdef KERNEL_STUB
        STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
 #else
-       kernel_filter_nlm_construct_gramian(dx, dy,
+       kernel_filter_nlm_construct_gramian(dx, dy, t,
                                            difference_image,
                                            buffer,
                                            transform, rank,
@@ -266,7 +277,9 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
                                            load_int4(rect),
                                            load_int4(filter_window),
                                            stride, f,
-                                           pass_stride);
+                                           pass_stride,
+                                           frame_offset,
+                                           use_time);
 #endif
 }
 
index 3b51bb4..5b552b0 100644 (file)
@@ -29,7 +29,7 @@
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
 kernel_cuda_filter_divide_shadow(int sample,
-                                 TileInfo *tile_info,
+                                 CCL_FILTER_TILE_INFO,
                                  float *unfilteredA,
                                  float *unfilteredB,
                                  float *sampleVariance,
@@ -59,7 +59,7 @@ kernel_cuda_filter_divide_shadow(int sample,
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
 kernel_cuda_filter_get_feature(int sample,
-                               TileInfo *tile_info,
+                               CCL_FILTER_TILE_INFO,
                                int m_offset,
                                int v_offset,
                                float *mean,
@@ -138,10 +138,12 @@ kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
 kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
+                                       CCL_FILTER_TILE_INFO,
                                        float *transform, int *rank,
                                        int4 filter_area, int4 rect,
                                        int radius, float pca_threshold,
-                                       int pass_stride)
+                                       int pass_stride, int frame_stride,
+                                       bool use_time)
 {
        int x = blockDim.x*blockIdx.x + threadIdx.x;
        int y = blockDim.y*blockIdx.y + threadIdx.y;
@@ -149,8 +151,11 @@ kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
                int *l_rank = rank + y*filter_area.z + x;
                float *l_transform = transform + y*filter_area.z + x;
                kernel_filter_construct_transform(buffer,
+                                                 tile_info,
                                                  x + filter_area.x, y + filter_area.y,
-                                                 rect, pass_stride,
+                                                 rect,
+                                                 pass_stride, frame_stride,
+                                                 use_time,
                                                  l_transform, l_rank,
                                                  radius, pca_threshold,
                                                  filter_area.z*filter_area.w,
@@ -170,6 +175,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
                                        int pass_stride,
                                        int r,
                                        int channel_offset,
+                                       int frame_offset,
                                        float a,
                                        float k_2)
 {
@@ -183,6 +189,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
                                                  difference_image + ofs,
                                                  rect, stride,
                                                  channel_offset,
+                                                 frame_offset,
                                                  a, k_2);
        }
 }
@@ -274,7 +281,8 @@ kernel_cuda_filter_nlm_normalize(float *out_image,
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_image,
+kernel_cuda_filter_nlm_construct_gramian(int t,
+                                         const float *ccl_restrict difference_image,
                                          const float *ccl_restrict buffer,
                                          float const* __restrict__ transform,
                                          int *rank,
@@ -286,13 +294,16 @@ kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_im
                                          int stride,
                                          int pass_stride,
                                          int r,
-                                         int f)
+                                         int f,
+                                         int frame_offset,
+                                         bool use_time)
 {
        int4 co, rect;
        int ofs;
        if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
                kernel_filter_nlm_construct_gramian(co.x, co.y,
                                                    co.z, co.w,
+                                                   t,
                                                    difference_image + ofs,
                                                    buffer,
                                                    transform, rank,
@@ -300,6 +311,8 @@ kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_im
                                                    rect, filter_window,
                                                    stride, f,
                                                    pass_stride,
+                                                   frame_offset,
+                                                   use_time,
                                                    threadIdx.y*blockDim.x + threadIdx.x);
        }
 }
index 8a821ee..996bc27 100644 (file)
@@ -127,11 +127,14 @@ __kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean,
 }
 
 __kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
+                                                    CCL_FILTER_TILE_INFO,
                                                     ccl_global float *transform,
                                                     ccl_global int *rank,
                                                     int4 filter_area,
                                                     int4 rect,
                                                     int pass_stride,
+                                                    int frame_stride,
+                                                    char use_time,
                                                     int radius,
                                                     float pca_threshold)
 {
@@ -141,8 +144,11 @@ __kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_
                ccl_global int *l_rank = rank + y*filter_area.z + x;
                ccl_global float *l_transform = transform + y*filter_area.z + x;
                kernel_filter_construct_transform(buffer,
+                                                 CCL_FILTER_TILE_INFO_ARG,
                                                  x + filter_area.x, y + filter_area.y,
-                                                 rect, pass_stride,
+                                                 rect,
+                                                 pass_stride, frame_stride,
+                                                 use_time,
                                                  l_transform, l_rank,
                                                  radius, pca_threshold,
                                                  filter_area.z*filter_area.w,
@@ -160,6 +166,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_
                                                     int pass_stride,
                                                     int r,
                                                     int channel_offset,
+                                                    int frame_offset,
                                                     float a,
                                                     float k_2)
 {
@@ -173,6 +180,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_
                                                  difference_image + ofs,
                                                  rect, stride,
                                                  channel_offset,
+                                                 frame_offset,
                                                  a, k_2);
        }
 }
@@ -254,7 +262,8 @@ __kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *out_image,
        }
 }
 
-__kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *ccl_restrict difference_image,
+__kernel void kernel_ocl_filter_nlm_construct_gramian(int t,
+                                                      const ccl_global float *ccl_restrict difference_image,
                                                       const ccl_global float *ccl_restrict buffer,
                                                       const ccl_global float *ccl_restrict transform,
                                                       ccl_global int *rank,
@@ -266,13 +275,16 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *cc
                                                       int stride,
                                                       int pass_stride,
                                                       int r,
-                                                      int f)
+                                                      int f,
+                                                      int frame_offset,
+                                                      char use_time)
 {
        int4 co, rect;
        int ofs;
        if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
                kernel_filter_nlm_construct_gramian(co.x, co.y,
                                                    co.z, co.w,
+                                                   t,
                                                    difference_image + ofs,
                                                    buffer,
                                                    transform, rank,
@@ -280,6 +292,8 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *cc
                                                    rect, filter_window,
                                                    stride, f,
                                                    pass_stride,
+                                                   frame_offset,
+                                                   use_time,
                                                    get_local_id(1)*get_local_size(0) + get_local_id(0));
        }
 }