Merge branch 'blender2.7'
[blender.git] / intern / cycles / device / device_cuda.cpp
index 67f5793..ada538a 100644 (file)
@@ -1300,7 +1300,8 @@ public:
 
                int pass_stride = task->buffer.pass_stride;
                int num_shifts = (2*r+1)*(2*r+1);
-               int channel_offset = 0;
+               int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0;
+               int frame_offset = 0;
 
                if(have_error())
                        return false;
@@ -1308,6 +1309,7 @@ public:
                CUdeviceptr difference     = cuda_device_ptr(task->buffer.temporary_mem.device_pointer);
                CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts;
                CUdeviceptr weightAccum = difference + 2*sizeof(float)*pass_stride*num_shifts;
+               CUdeviceptr scale_ptr = 0;
 
                cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*pass_stride));
                cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*pass_stride));
@@ -1326,10 +1328,10 @@ public:
 
                        CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w*h, num_shifts);
 
-                       void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &a, &k_2};
+                       void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &scale_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &frame_offset, &a, &k_2};
                        void *blur_args[]            = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
                        void *calc_weight_args[]     = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
-                       void *update_output_args[]   = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &pass_stride, &r, &f};
+                       void *update_output_args[]   = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &pass_stride, &channel_offset, &r, &f};
 
                        CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
                        CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
@@ -1366,32 +1368,33 @@ public:
                                   task->storage.h);
 
                void *args[] = {&task->buffer.mem.device_pointer,
+                               &task->tile_info_mem.device_pointer,
                                &task->storage.transform.device_pointer,
                                &task->storage.rank.device_pointer,
                                &task->filter_area,
                                &task->rect,
                                &task->radius,
                                &task->pca_threshold,
-                               &task->buffer.pass_stride};
+                               &task->buffer.pass_stride,
+                               &task->buffer.frame_stride,
+                               &task->buffer.use_time};
                CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
                cuda_assert(cuCtxSynchronize());
 
                return !have_error();
        }
 
-       bool denoising_reconstruct(device_ptr color_ptr,
-                                  device_ptr color_variance_ptr,
-                                  device_ptr output_ptr,
-                                  DenoisingTask *task)
+       bool denoising_accumulate(device_ptr color_ptr,
+                                 device_ptr color_variance_ptr,
+                                 device_ptr scale_ptr,
+                                 int frame,
+                                 DenoisingTask *task)
        {
                if(have_error())
                        return false;
 
                CUDAContextScope scope(this);
 
-               mem_zero(task->storage.XtWX);
-               mem_zero(task->storage.XtWY);
-
                int r = task->radius;
                int f = 4;
                float a = 1.0f;
@@ -1400,6 +1403,8 @@ public:
                int w = task->reconstruction_state.source_w;
                int h = task->reconstruction_state.source_h;
                int stride = task->buffer.stride;
+               int frame_offset = frame * task->buffer.frame_stride;
+               int t = task->tile_info->frames[frame];
 
                int pass_stride = task->buffer.pass_stride;
                int num_shifts = (2*r+1)*(2*r+1);
@@ -1410,60 +1415,73 @@ public:
                CUdeviceptr difference     = cuda_device_ptr(task->buffer.temporary_mem.device_pointer);
                CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts;
 
-               {
-                       CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
-                       cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference,   cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
-                       cuda_assert(cuModuleGetFunction(&cuNLMBlur,             cuFilterModule, "kernel_cuda_filter_nlm_blur"));
-                       cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight,       cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
-                       cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
-
-                       cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference,   CU_FUNC_CACHE_PREFER_L1));
-                       cuda_assert(cuFuncSetCacheConfig(cuNLMBlur,             CU_FUNC_CACHE_PREFER_L1));
-                       cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight,       CU_FUNC_CACHE_PREFER_L1));
-                       cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
-
-                       CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
-                                            task->reconstruction_state.source_w * task->reconstruction_state.source_h,
-                                            num_shifts);
-
-                       void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &pass_stride, &a, &k_2};
-                       void *blur_args[]            = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
-                       void *calc_weight_args[]     = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
-                       void *construct_gramian_args[] = {&blurDifference,
-                                                         &task->buffer.mem.device_pointer,
-                                                         &task->storage.transform.device_pointer,
-                                                         &task->storage.rank.device_pointer,
-                                                         &task->storage.XtWX.device_pointer,
-                                                         &task->storage.XtWY.device_pointer,
-                                                         &task->reconstruction_state.filter_window,
-                                                         &w, &h, &stride,
-                                                         &pass_stride, &r,
-                                                         &f};
-
-                       CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
-                       CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
-                       CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
-                       CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
-                       CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
-               }
+               CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
+               cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference,   cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+               cuda_assert(cuModuleGetFunction(&cuNLMBlur,             cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+               cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight,       cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+               cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
+
+               cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference,   CU_FUNC_CACHE_PREFER_L1));
+               cuda_assert(cuFuncSetCacheConfig(cuNLMBlur,             CU_FUNC_CACHE_PREFER_L1));
+               cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight,       CU_FUNC_CACHE_PREFER_L1));
+               cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
+
+               CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
+                                    task->reconstruction_state.source_w * task->reconstruction_state.source_h,
+                                    num_shifts);
+
+               void *calc_difference_args[] = {&color_ptr,
+                                               &color_variance_ptr,
+                                               &scale_ptr,
+                                               &difference,
+                                               &w, &h,
+                                               &stride, &pass_stride,
+                                               &r, &pass_stride,
+                                               &frame_offset,
+                                               &a, &k_2};
+               void *blur_args[]            = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
+               void *calc_weight_args[]     = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
+               void *construct_gramian_args[] = {&t,
+                                                 &blurDifference,
+                                                 &task->buffer.mem.device_pointer,
+                                                 &task->storage.transform.device_pointer,
+                                                 &task->storage.rank.device_pointer,
+                                                 &task->storage.XtWX.device_pointer,
+                                                 &task->storage.XtWY.device_pointer,
+                                                 &task->reconstruction_state.filter_window,
+                                                 &w, &h, &stride,
+                                                 &pass_stride, &r,
+                                                 &f,
+                                                 &frame_offset,
+                                                 &task->buffer.use_time};
+
+               CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
+               CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+               CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
+               CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+               CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
+               cuda_assert(cuCtxSynchronize());
 
-               {
-                       CUfunction cuFinalize;
-                       cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
-                       cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
-                       void *finalize_args[] = {&output_ptr,
-                                                        &task->storage.rank.device_pointer,
-                                                        &task->storage.XtWX.device_pointer,
-                                                        &task->storage.XtWY.device_pointer,
-                                                        &task->filter_area,
-                                                        &task->reconstruction_state.buffer_params.x,
-                                                        &task->render_buffer.samples};
-                       CUDA_GET_BLOCKSIZE(cuFinalize,
-                                          task->reconstruction_state.source_w,
-                                          task->reconstruction_state.source_h);
-                       CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
-               }
+               return !have_error();
+       }
 
+       bool denoising_solve(device_ptr output_ptr,
+                            DenoisingTask *task)
+       {
+               CUfunction cuFinalize;
+               cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
+               cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
+               void *finalize_args[] = {&output_ptr,
+                                        &task->storage.rank.device_pointer,
+                                        &task->storage.XtWX.device_pointer,
+                                        &task->storage.XtWY.device_pointer,
+                                        &task->filter_area,
+                                        &task->reconstruction_state.buffer_params.x,
+                                        &task->render_buffer.samples};
+               CUDA_GET_BLOCKSIZE(cuFinalize,
+                                  task->reconstruction_state.source_w,
+                                  task->reconstruction_state.source_h);
+               CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
                cuda_assert(cuCtxSynchronize());
 
                return !have_error();
@@ -1533,6 +1551,7 @@ public:
                                   int variance_offset,
                                   device_ptr mean_ptr,
                                   device_ptr variance_ptr,
+                                  float scale,
                                   DenoisingTask *task)
        {
                if(have_error())
@@ -1553,6 +1572,7 @@ public:
                                &variance_offset,
                                &mean_ptr,
                                &variance_ptr,
+                               &scale,
                                &task->rect,
                                &task->render_buffer.pass_stride,
                                &task->render_buffer.offset};
@@ -1562,6 +1582,36 @@ public:
                return !have_error();
        }
 
+       bool denoising_write_feature(int out_offset, // Launches "kernel_cuda_filter_write_feature" and waits for it; returns false on error.
+                                    device_ptr from_ptr, // forwarded to the kernel unchanged (4th kernel argument)
+                                    device_ptr buffer_ptr, // forwarded to the kernel unchanged (5th kernel argument)
+                                    DenoisingTask *task) // supplies render_buffer.samples, buffer_params, filter_area and rect
+       {
+               if(have_error()) // an error is already recorded: do not launch anything
+                       return false;
+
+               CUDAContextScope scope(this); // NOTE(review): presumably makes this device's CUDA context current until scope exit - confirm
+
+               CUfunction cuFilterWriteFeature;
+               cuda_assert(cuModuleGetFunction(&cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature")); // look the kernel up by name in the filter module
+               cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1)); // ask for the L1-preferring cache configuration
+               CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, // macro defined elsewhere; presumably derives the launch dimensions used by CUDA_LAUNCH_KERNEL below
+                                  task->filter_area.z, // denoise() fills filter_area as (rtile.x, rtile.y, rtile.w, rtile.h), so .z is the tile width
+                                  task->filter_area.w); // ... and .w is the tile height
+
+               void *args[] = {&task->render_buffer.samples, // array of pointers to each kernel argument; order presumably mirrors the kernel signature - verify
+                               &task->reconstruction_state.buffer_params,
+                               &task->filter_area,
+                               &from_ptr,
+                               &buffer_ptr,
+                               &out_offset,
+                               &task->rect};
+               CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args); // macro defined elsewhere; launches the kernel with the arguments above
+               cuda_assert(cuCtxSynchronize()); // block until the work queued on the current context has finished
+
+               return !have_error(); // true only if no error has been recorded
+       }
+
        bool denoising_detect_outliers(device_ptr image_ptr,
                                       device_ptr variance_ptr,
                                       device_ptr depth_ptr,
@@ -1596,11 +1646,13 @@ public:
        void denoise(RenderTile &rtile, DenoisingTask& denoising)
        {
                denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising);
-               denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, &denoising);
+               denoising.functions.accumulate = function_bind(&CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
+               denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising);
                denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
                denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
                denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
-               denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+               denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
+               denoising.functions.write_feature = function_bind(&CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising);
                denoising.functions.detect_outliers = function_bind(&CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
 
                denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);