Merge branch 'master' into blender2.8
authorCampbell Barton <ideasman42@gmail.com>
Mon, 8 Oct 2018 21:46:00 +0000 (08:46 +1100)
committerCampbell Barton <ideasman42@gmail.com>
Mon, 8 Oct 2018 21:46:00 +0000 (08:46 +1100)
17 files changed:
intern/cycles/device/device_cpu.cpp
intern/cycles/device/device_cuda.cpp
intern/cycles/device/device_denoising.cpp
intern/cycles/device/opencl/opencl_base.cpp
intern/cycles/kernel/filter/filter_nlm_cpu.h
intern/cycles/kernel/kernels/cpu/filter_cpu.h
intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
intern/cycles/kernel/kernels/cuda/filter.cu
intern/cycles/kernel/kernels/opencl/filter.cl
intern/cycles/util/util_math.h
intern/cycles/util/util_math_fast.h
intern/cycles/util/util_math_float4.h
intern/cycles/util/util_math_int4.h
intern/cycles/util/util_types_int4.h
intern/cycles/util/util_types_int4_impl.h
source/blender/bmesh/operators/bmo_subdivide.c
tests/python/modules/render_report.py

index 7c72ab1a00931618522982646be7b56f384f29d1..7eb73dea3ef610300d447a4078a6a5ca1ef0333c 100644 (file)
@@ -184,11 +184,11 @@ public:
        KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                               filter_detect_outliers_kernel;
        KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                               filter_combine_halves_kernel;
 
-       KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
-       KernelFunctions<void(*)(float*, float*, int*, int, int)>                                 filter_nlm_blur_kernel;
-       KernelFunctions<void(*)(float*, float*, int*, int, int)>                                 filter_nlm_calc_weight_kernel;
-       KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)>       filter_nlm_update_output_kernel;
-       KernelFunctions<void(*)(float*, float*, int*, int)>                                      filter_nlm_normalize_kernel;
+       KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)>   filter_nlm_calc_difference_kernel;
+       KernelFunctions<void(*)(float*, float*, int*, int, int)>                                   filter_nlm_blur_kernel;
+       KernelFunctions<void(*)(float*, float*, int*, int, int)>                                   filter_nlm_calc_weight_kernel;
+       KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel;
+       KernelFunctions<void(*)(float*, float*, int*, int)>                                        filter_nlm_normalize_kernel;
 
        KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)>                         filter_construct_transform_kernel;
        KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel;
@@ -499,6 +499,7 @@ public:
                        filter_nlm_update_output_kernel()(dx, dy,
                                                          blurDifference,
                                                          (float*) image_ptr,
+                                                         difference,
                                                          (float*) out_ptr,
                                                          weightAccum,
                                                          local_rect,
index 2af8a9f48c87f06b3b08e941ffa26c6ea94e9840..22b1bc493c848b67c4aa67773342cc99bdf1edd3 100644 (file)
@@ -1397,18 +1397,14 @@ public:
                int h = task->reconstruction_state.source_h;
                int stride = task->buffer.stride;
 
-               int shift_stride = stride*h;
+               int pass_stride = task->buffer.pass_stride;
                int num_shifts = (2*r+1)*(2*r+1);
-               int mem_size = sizeof(float)*shift_stride*num_shifts;
-
-               device_only_memory<uchar> temporary_mem(this, "Denoising temporary_mem");
-               temporary_mem.alloc_to_device(2*mem_size);
 
                if(have_error())
                        return false;
 
-               CUdeviceptr difference     = cuda_device_ptr(temporary_mem.device_pointer);
-               CUdeviceptr blurDifference = difference + mem_size;
+               CUdeviceptr difference     = cuda_device_ptr(task->buffer.temporary_mem.device_pointer);
+               CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts;
 
                {
                        CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
@@ -1426,9 +1422,9 @@ public:
                                             task->reconstruction_state.source_w * task->reconstruction_state.source_h,
                                             num_shifts);
 
-                       void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &shift_stride, &r, &task->buffer.pass_stride, &a, &k_2};
-                       void *blur_args[]            = {&difference, &blurDifference, &w, &h, &stride, &shift_stride, &r, &f};
-                       void *calc_weight_args[]     = {&blurDifference, &difference, &w, &h, &stride, &shift_stride, &r, &f};
+                       void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &pass_stride, &a, &k_2};
+                       void *blur_args[]            = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
+                       void *calc_weight_args[]     = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
                        void *construct_gramian_args[] = {&blurDifference,
                                                          &task->buffer.mem.device_pointer,
                                                          &task->storage.transform.device_pointer,
@@ -1437,9 +1433,8 @@ public:
                                                          &task->storage.XtWY.device_pointer,
                                                          &task->reconstruction_state.filter_window,
                                                          &w, &h, &stride,
-                                                         &shift_stride, &r,
-                                                         &f,
-                                                     &task->buffer.pass_stride};
+                                                         &pass_stride, &r,
+                                                         &f};
 
                        CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
                        CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
@@ -1448,8 +1443,6 @@ public:
                        CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
                }
 
-               temporary_mem.free();
-
                {
                        CUfunction cuFinalize;
                        cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
index 23c18fa15b2cf678d9a3320115ab85ccacf2a318..78c65a3d22d5b1e8ffeaf6a93b85a48f6683f1f3 100644 (file)
@@ -99,14 +99,18 @@ void DenoisingTask::setup_denoising_buffer()
        buffer.mem.alloc_to_device(mem_size, false);
 
        /* CPUs process shifts sequentially while GPUs process them in parallel. */
-       int num_shifts = 1;
+       int num_layers;
        if(buffer.gpu_temporary_mem) {
                /* Shadowing prefiltering uses a radius of 6, so allocate at least that much. */
                int max_radius = max(radius, 6);
-               num_shifts = (2*max_radius + 1) * (2*max_radius + 1);
+               int num_shifts = (2*max_radius + 1) * (2*max_radius + 1);
+               num_layers = 2*num_shifts + 1;
+       }
+       else {
+               num_layers = 3;
        }
        /* Allocate two layers per shift as well as one for the weight accumulation. */
-       buffer.temporary_mem.alloc_to_device((2*num_shifts + 1) * buffer.pass_stride);
+       buffer.temporary_mem.alloc_to_device(num_layers * buffer.pass_stride);
 }
 
 void DenoisingTask::prefilter_shadowing()
index cc887134bb05ba37f67cc60af7e323b159e46220..ffd3b7d762521f1c002f2bd55b1ad1124619af0d 100644 (file)
@@ -865,38 +865,38 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
        int h = task->reconstruction_state.source_h;
        int stride = task->buffer.stride;
 
-       int shift_stride = stride*h;
-       int num_shifts = (2*task->radius + 1)*(2*task->radius + 1);
-       int mem_size = sizeof(float)*shift_stride*num_shifts;
+       int r = task->radius;
+       int pass_stride = task->buffer.pass_stride;
+       int num_shifts = (2*r+1)*(2*r+1);
 
-       cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
-       opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct");
-       cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
-       opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct");
+       device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts);
+       device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts);
+       cl_mem difference_mem = CL_MEM_PTR(*difference);
+       cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference);
 
        kernel_set_args(ckNLMCalcDifference, 0,
                        color_mem,
                        color_variance_mem,
-                       difference,
+                       difference_mem,
                        w, h, stride,
-                       shift_stride,
-                       task->radius,
-                       task->buffer.pass_stride,
+                       pass_stride,
+                       r,
+                       pass_stride,
                        1.0f, task->nlm_k_2);
        kernel_set_args(ckNLMBlur, 0,
-                       difference,
-                       blurDifference,
+                       difference_mem,
+                       blurDifference_mem,
                        w, h, stride,
-                       shift_stride,
-                       task->radius, 4);
+                       pass_stride,
+                       r, 4);
        kernel_set_args(ckNLMCalcWeight, 0,
-                       blurDifference,
-                       difference,
+                       blurDifference_mem,
+                       difference_mem,
                        w, h, stride,
-                       shift_stride,
-                       task->radius, 4);
+                       pass_stride,
+                       r, 4);
        kernel_set_args(ckNLMConstructGramian, 0,
-                       blurDifference,
+                       blurDifference_mem,
                        buffer_mem,
                        transform_mem,
                        rank_mem,
@@ -904,9 +904,8 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
                        XtWY_mem,
                        task->reconstruction_state.filter_window,
                        w, h, stride,
-                       shift_stride,
-                       task->radius, 4,
-                       task->buffer.pass_stride);
+                       pass_stride,
+                       r, 4);
 
        enqueue_kernel(ckNLMCalcDifference,   w*h, num_shifts, true);
        enqueue_kernel(ckNLMBlur,             w*h, num_shifts, true);
@@ -914,9 +913,6 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
        enqueue_kernel(ckNLMBlur,             w*h, num_shifts, true);
        enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256);
 
-       opencl_assert(clReleaseMemObject(difference));
-       opencl_assert(clReleaseMemObject(blurDifference));
-
        kernel_set_args(ckFinalize, 0,
                        output_mem,
                        rank_mem,
index e2da0fd872bce418731e78ab9face67bb3b57b28..af73c0dadf26585eed73ea872c9cde98e15ffb93 100644 (file)
@@ -16,6 +16,9 @@
 
 CCL_NAMESPACE_BEGIN
 
+#define load4_a(buf, ofs) (*((float4*) ((buf) + (ofs))))
+#define load4_u(buf, ofs) load_float4((buf)+(ofs))
+
 ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
                                                          const float *ccl_restrict weight_image,
                                                          const float *ccl_restrict variance_image,
@@ -26,20 +29,28 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
                                                          float a,
                                                          float k_2)
 {
+       /* Strides need to be aligned to 16 bytes. */
+       kernel_assert((stride % 4) == 0 && (channel_offset % 4) == 0);
+
+       int aligned_lowx = rect.x & (~3);
+       const int numChannels = (channel_offset > 0)? 3 : 1;
+       const float4 channel_fac = make_float4(1.0f / numChannels);
+
        for(int y = rect.y; y < rect.w; y++) {
-               for(int x = rect.x; x < rect.z; x++) {
-                       float diff = 0.0f;
-                       int numChannels = channel_offset? 3 : 1;
-                       for(int c = 0; c < numChannels; c++) {
-                               float cdiff = weight_image[c*channel_offset + y*stride + x] - weight_image[c*channel_offset + (y+dy)*stride + (x+dx)];
-                               float pvar = variance_image[c*channel_offset + y*stride + x];
-                               float qvar = variance_image[c*channel_offset + (y+dy)*stride + (x+dx)];
-                               diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
-                       }
-                       if(numChannels > 1) {
-                               diff *= 1.0f/numChannels;
+               int idx_p = y*stride + aligned_lowx;
+               int idx_q = (y+dy)*stride + aligned_lowx + dx;
+               for(int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) {
+                       float4 diff = make_float4(0.0f);
+                       for(int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) {
+                               /* idx_p is guaranteed to be aligned, but idx_q isn't. */
+                               float4 color_p = load4_a(weight_image, idx_p + chan_ofs);
+                               float4 color_q = load4_u(weight_image, idx_q + chan_ofs);
+                               float4 cdiff = color_p - color_q;
+                               float4 var_p = load4_a(variance_image, idx_p + chan_ofs);
+                               float4 var_q = load4_u(variance_image, idx_q + chan_ofs);
+                               diff += (cdiff*cdiff - a*(var_p + min(var_p, var_q))) / (make_float4(1e-8f) + k_2*(var_p+var_q));
                        }
-                       difference_image[y*stride + x] = diff;
+                       load4_a(difference_image, idx_p) = diff*channel_fac;
                }
        }
 }
@@ -50,52 +61,77 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen
                                               int stride,
                                               int f)
 {
-       int aligned_lowx = rect.x / 4;
-       int aligned_highx = (rect.z + 3) / 4;
+       int aligned_lowx = round_down(rect.x, 4);
        for(int y = rect.y; y < rect.w; y++) {
                const int low = max(rect.y, y-f);
                const int high = min(rect.w, y+f+1);
-               for(int x = rect.x; x < rect.z; x++) {
-                       out_image[y*stride + x] = 0.0f;
+               for(int x = aligned_lowx; x < rect.z; x += 4) {
+                       load4_a(out_image, y*stride + x) = make_float4(0.0f);
                }
                for(int y1 = low; y1 < high; y1++) {
-                       float4* out_image4 = (float4*)(out_image + y*stride);
-                       float4* difference_image4 = (float4*)(difference_image + y1*stride);
-                       for(int x = aligned_lowx; x < aligned_highx; x++) {
-                               out_image4[x] += difference_image4[x];
+                       for(int x = aligned_lowx; x < rect.z; x += 4) {
+                               load4_a(out_image, y*stride + x) += load4_a(difference_image, y1*stride + x);
                        }
                }
-               for(int x = rect.x; x < rect.z; x++) {
-                       out_image[y*stride + x] *= 1.0f/(high - low);
+               float fac = 1.0f/(high - low);
+               for(int x = aligned_lowx; x < rect.z; x += 4) {
+                       load4_a(out_image, y*stride + x) *= fac;
                }
        }
 }
 
-ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
-                                                     float *out_image,
-                                                     int4 rect,
-                                                     int stride,
-                                                     int f)
+ccl_device_inline void nlm_blur_horizontal(const float *ccl_restrict difference_image,
+                                           float *out_image,
+                                           int4 rect,
+                                           int stride,
+                                           int f)
 {
+       int aligned_lowx = round_down(rect.x, 4);
        for(int y = rect.y; y < rect.w; y++) {
-               for(int x = rect.x; x < rect.z; x++) {
-                       out_image[y*stride + x] = 0.0f;
+               for(int x = aligned_lowx; x < rect.z; x += 4) {
+                       load4_a(out_image, y*stride + x) = make_float4(0.0f);
                }
        }
+
        for(int dx = -f; dx <= f; dx++) {
-               int pos_dx = max(0, dx);
-               int neg_dx = min(0, dx);
+               aligned_lowx = round_down(rect.x - min(0, dx), 4);
+               int highx = rect.z - max(0, dx);
+               int4 lowx4 = make_int4(rect.x - min(0, dx));
+               int4 highx4 = make_int4(rect.z - max(0, dx));
                for(int y = rect.y; y < rect.w; y++) {
-                       for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) {
-                               out_image[y*stride + x] += difference_image[y*stride + x+dx];
+                       for(int x = aligned_lowx; x < highx; x += 4) {
+                               int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
+                               int4 active = (x4 >= lowx4) & (x4 < highx4);
+
+                               float4 diff = load4_u(difference_image, y*stride + x + dx);
+                               load4_a(out_image, y*stride + x) += mask(active, diff);
                        }
                }
        }
+
+       aligned_lowx = round_down(rect.x, 4);
        for(int y = rect.y; y < rect.w; y++) {
-               for(int x = rect.x; x < rect.z; x++) {
-                       const int low = max(rect.x, x-f);
-                       const int high = min(rect.z, x+f+1);
-                       out_image[y*stride + x] = fast_expf(-max(out_image[y*stride + x] * (1.0f/(high - low)), 0.0f));
+               for(int x = aligned_lowx; x < rect.z; x += 4) {
+                       float4 x4 = make_float4(x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f);
+                       float4 low = max(make_float4(rect.x), x4 - make_float4(f));
+                       float4 high = min(make_float4(rect.z), x4 + make_float4(f+1));
+                       load4_a(out_image, y*stride + x) *= rcp(high - low);
+               }
+       }
+}
+
+ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
+                                                     float *out_image,
+                                                     int4 rect,
+                                                     int stride,
+                                                     int f)
+{
+       nlm_blur_horizontal(difference_image, out_image, rect, stride, f);
+
+       int aligned_lowx = round_down(rect.x, 4);
+       for(int y = rect.y; y < rect.w; y++) {
+               for(int x = aligned_lowx; x < rect.z; x += 4) {
+                       load4_a(out_image, y*stride + x) = fast_expf4(-max(load4_a(out_image, y*stride + x), make_float4(0.0f)));
                }
        }
 }
@@ -103,23 +139,29 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict d
 ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
                                                        const float *ccl_restrict difference_image,
                                                        const float *ccl_restrict image,
+                                                       float *temp_image,
                                                        float *out_image,
                                                        float *accum_image,
                                                        int4 rect,
                                                        int stride,
                                                        int f)
 {
+       nlm_blur_horizontal(difference_image, temp_image, rect, stride, f);
+
+       int aligned_lowx = round_down(rect.x, 4);
        for(int y = rect.y; y < rect.w; y++) {
-               for(int x = rect.x; x < rect.z; x++) {
-                       const int low = max(rect.x, x-f);
-                       const int high = min(rect.z, x+f+1);
-                       float sum = 0.0f;
-                       for(int x1 = low; x1 < high; x1++) {
-                               sum += difference_image[y*stride + x1];
-                       }
-                       float weight = sum * (1.0f/(high - low));
-                       accum_image[y*stride + x] += weight;
-                       out_image[y*stride + x] += weight*image[(y+dy)*stride + (x+dx)];
+               for(int x = aligned_lowx; x < rect.z; x += 4) {
+                       int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
+                       int4 active = (x4 >= make_int4(rect.x)) & (x4 < make_int4(rect.z));
+
+                       int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx);
+
+                       float4 weight = load4_a(temp_image, idx_p);
+                       load4_a(accum_image, idx_p) += mask(active, weight);
+
+                       float4 val = load4_u(image, idx_q);
+
+                       load4_a(out_image, idx_p) += mask(active, weight*val);
                }
        }
 }
@@ -177,4 +219,7 @@ ccl_device_inline void kernel_filter_nlm_normalize(float *out_image,
        }
 }
 
+#undef load4_a
+#undef load4_u
+
 CCL_NAMESPACE_END
index b62aa9663ec5280245ac6f2886e70fdd9ba49cf2..e036b53b8108b0802c16fa4eb37218cd2e728d41 100644 (file)
@@ -95,6 +95,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
                                                          int dy,
                                                          float *difference_image,
                                                          float *image,
+                                                         float *temp_image,
                                                          float *out_image,
                                                          float *accum_image,
                                                          int* rect,
index 26777fdabb2b1dac49e2d125bc807200c60a3897..4c75871148157ff38311add27f794bc31375486b 100644 (file)
@@ -191,6 +191,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
                                                          int dy,
                                                          float *difference_image,
                                                          float *image,
+                                                         float *temp_image,
                                                          float *out_image,
                                                          float *accum_image,
                                                          int *rect,
@@ -200,7 +201,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
 #ifdef KERNEL_STUB
        STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
 #else
-       kernel_filter_nlm_update_output(dx, dy, difference_image, image, out_image, accum_image, load_int4(rect), stride, f);
+       kernel_filter_nlm_update_output(dx, dy, difference_image, image, temp_image, out_image, accum_image, load_int4(rect), stride, f);
 #endif
 }
 
index 0561c40e6b18ba5b15143f46a9ef119d820265cd..b856cbde45c487fdab9d7c127465951bcf6e2008 100644 (file)
@@ -140,7 +140,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
                                        int w,
                                        int h,
                                        int stride,
-                                       int shift_stride,
+                                       int pass_stride,
                                        int r,
                                        int channel_offset,
                                        float a,
@@ -148,7 +148,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
 {
        int4 co, rect;
        int ofs;
-       if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+       if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
                kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
                                                  weight_image,
                                                  variance_image,
@@ -165,13 +165,13 @@ kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image,
                             int w,
                             int h,
                             int stride,
-                            int shift_stride,
+                            int pass_stride,
                             int r,
                             int f)
 {
        int4 co, rect;
        int ofs;
-       if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+       if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
                kernel_filter_nlm_blur(co.x, co.y,
                                       difference_image + ofs,
                                       out_image + ofs,
@@ -186,13 +186,13 @@ kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
                                    int w,
                                    int h,
                                    int stride,
-                                   int shift_stride,
+                                   int pass_stride,
                                    int r,
                                    int f)
 {
        int4 co, rect;
        int ofs;
-       if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+       if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
                kernel_filter_nlm_calc_weight(co.x, co.y,
                                              difference_image + ofs,
                                              out_image + ofs,
@@ -209,13 +209,13 @@ kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image,
                                      int w,
                                      int h,
                                      int stride,
-                                     int shift_stride,
+                                     int pass_stride,
                                      int r,
                                      int f)
 {
        int4 co, rect;
        int ofs;
-       if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+       if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
                kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
                                                difference_image + ofs,
                                                image,
@@ -252,14 +252,13 @@ kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_im
                                          int w,
                                          int h,
                                          int stride,
-                                         int shift_stride,
+                                         int pass_stride,
                                          int r,
-                                         int f,
-                                         int pass_stride)
+                                         int f)
 {
        int4 co, rect;
        int ofs;
-       if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) {
+       if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
                kernel_filter_nlm_construct_gramian(co.x, co.y,
                                                    co.z, co.w,
                                                    difference_image + ofs,
index 3c75754fb3917853be7586f776908f4312eb2bb0..a550f97f4eb6839d7f6266fa5c8567e71220adc3 100644 (file)
@@ -132,7 +132,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_
                                                     int w,
                                                     int h,
                                                     int stride,
-                                                    int shift_stride,
+                                                    int pass_stride,
                                                     int r,
                                                     int channel_offset,
                                                     float a,
@@ -140,7 +140,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_
 {
        int4 co, rect;
        int ofs;
-       if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+       if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
                kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
                                                  weight_image,
                                                  variance_image,
@@ -155,13 +155,13 @@ __kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict di
                                          int w,
                                          int h,
                                          int stride,
-                                         int shift_stride,
+                                         int pass_stride,
                                          int r,
                                          int f)
 {
        int4 co, rect;
        int ofs;
-       if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+       if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
                kernel_filter_nlm_blur(co.x, co.y,
                                       difference_image + ofs,
                                       out_image + ofs,
@@ -174,13 +174,13 @@ __kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_rest
                                                 int w,
                                                 int h,
                                                 int stride,
-                                                int shift_stride,
+                                                int pass_stride,
                                                 int r,
                                                 int f)
 {
        int4 co, rect;
        int ofs;
-       if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+       if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
                kernel_filter_nlm_calc_weight(co.x, co.y,
                                              difference_image + ofs,
                                              out_image + ofs,
@@ -195,13 +195,13 @@ __kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_re
                                                   int w,
                                                   int h,
                                                   int stride,
-                                                  int shift_stride,
+                                                  int pass_stride,
                                                   int r,
                                                   int f)
 {
        int4 co, rect;
        int ofs;
-       if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+       if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
                kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
                                                difference_image + ofs,
                                                image,
@@ -234,14 +234,13 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *cc
                                                       int w,
                                                       int h,
                                                       int stride,
-                                                      int shift_stride,
+                                                      int pass_stride,
                                                       int r,
-                                                      int f,
-                                                      int pass_stride)
+                                                      int f)
 {
        int4 co, rect;
        int ofs;
-       if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) {
+       if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
                kernel_filter_nlm_construct_gramian(co.x, co.y,
                                                    co.z, co.w,
                                                    difference_image + ofs,
index 52aeb8d85995f5eee438d1812413f6a25f1d4d9f..eafae5f31c0304cd280b2bd0a6ca52242b71d707 100644 (file)
@@ -220,6 +220,30 @@ ccl_device_inline float __uint_as_float(uint i)
        u.i = i;
        return u.f;
 }
+
+ccl_device_inline int4 __float4_as_int4(float4 f)
+{
+#ifdef __KERNEL_SSE__
+       return int4(_mm_castps_si128(f.m128));
+       #else
+       return make_int4(__float_as_int(f.x),
+                        __float_as_int(f.y),
+                        __float_as_int(f.z),
+                        __float_as_int(f.w));
+#endif
+}
+
+ccl_device_inline float4 __int4_as_float4(int4 i)
+{
+#ifdef __KERNEL_SSE__
+       return float4(_mm_castsi128_ps(i.m128));
+#else
+       return make_float4(__int_as_float(i.x),
+                          __int_as_float(i.y),
+                          __int_as_float(i.z),
+                          __int_as_float(i.w));
+#endif
+}
 #endif /* __KERNEL_OPENCL__ */
 
 /* Versions of functions which are safe for fast math. */
index d3960deb3b4dbfe882e74537514bed771db986b5..323d40058e5997c6dc8b8c6d980e5977b5ff75ad 100644 (file)
@@ -58,6 +58,11 @@ ccl_device_inline float madd(const float a, const float b, const float c)
        return a * b + c;
 }
 
+ccl_device_inline float4 madd4(const float4 a, const float4 b, const float4 c)
+{
+       return a * b + c;
+}
+
 /*
  * FAST & APPROXIMATE MATH
  *
@@ -438,6 +443,29 @@ ccl_device_inline float fast_expf(float x)
        return fast_exp2f(x / M_LN2_F);
 }
 
+#ifndef __KERNEL_GPU__
+ccl_device float4 fast_exp2f4(float4 x)
+{
+       const float4 one = make_float4(1.0f);
+       const float4 limit = make_float4(126.0f);
+       x = clamp(x, -limit, limit);
+       int4 m = make_int4(x);
+       x = one - (one - (x - make_float4(m)));
+       float4 r = make_float4(1.33336498402e-3f);
+       r = madd4(x, r, make_float4(9.810352697968e-3f));
+       r = madd4(x, r, make_float4(5.551834031939e-2f));
+       r = madd4(x, r, make_float4(0.2401793301105f));
+       r = madd4(x, r, make_float4(0.693144857883f));
+       r = madd4(x, r, make_float4(1.0f));
+       return __int4_as_float4(__float4_as_int4(r) + (m << 23));
+}
+
+ccl_device_inline float4 fast_expf4(float4 x)
+{
+       return fast_exp2f4(x / M_LN2_F);
+}
+#endif
+
 ccl_device_inline float fast_exp10(float x)
 {
        /* Examined 2217701018 values of exp10 on [-37.9290009,37.9290009]:
index aa7e56fefe99ca61d8966dfa106e449dfc724c6e..105547098b5d721d6cdb7c7a58c37763622b76af 100644 (file)
@@ -38,6 +38,7 @@ ccl_device_inline float4 operator+(const float4& a, const float4& b);
 ccl_device_inline float4 operator-(const float4& a, const float4& b);
 ccl_device_inline float4 operator+=(float4& a, const float4& b);
 ccl_device_inline float4 operator*=(float4& a, const float4& b);
+ccl_device_inline float4 operator*=(float4& a, float f);
 ccl_device_inline float4 operator/=(float4& a, float f);
 
 ccl_device_inline int4 operator<(const float4& a, const float4& b);
@@ -58,6 +59,7 @@ ccl_device_inline float4 normalize(const float4& a);
 ccl_device_inline float4 safe_normalize(const float4& a);
 ccl_device_inline float4 min(const float4& a, const float4& b);
 ccl_device_inline float4 max(const float4& a, const float4& b);
+ccl_device_inline float4 clamp(const float4& a, const float4& mn, const float4& mx);
 ccl_device_inline float4 fabs(const float4& a);
 #endif  /* !__KERNEL_OPENCL__*/
 
@@ -168,6 +170,11 @@ ccl_device_inline float4 operator*=(float4& a, const float4& b)
        return a = a * b;
 }
 
+ccl_device_inline float4 operator*=(float4& a, float f)
+{
+       return a = a * f;
+}
+
 ccl_device_inline float4 operator/=(float4& a, float f)
 {
        return a = a / f;
@@ -333,6 +340,11 @@ ccl_device_inline float4 max(const float4& a, const float4& b)
 #endif
 }
 
+ccl_device_inline float4 clamp(const float4& a, const float4& mn, const float4& mx)
+{
+       return min(max(a, mn), mx);
+}
+
 ccl_device_inline float4 fabs(const float4& a)
 {
 #ifdef __KERNEL_SSE__
index 79a8c0841e747a80b9162c1669528d47f0bcc149..cde366b8c27b07622ac432be665e55def8ff3ab0 100644 (file)
@@ -31,6 +31,10 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline int4 operator+(const int4& a, const int4& b);
 ccl_device_inline int4 operator+=(int4& a, const int4& b);
 ccl_device_inline int4 operator>>(const int4& a, int i);
+ccl_device_inline int4 operator<<(const int4& a, int i);
+ccl_device_inline int4 operator<(const int4& a, const int4& b);
+ccl_device_inline int4 operator>=(const int4& a, const int4& b);
+ccl_device_inline int4 operator&(const int4& a, const int4& b);
 ccl_device_inline int4 min(int4 a, int4 b);
 ccl_device_inline int4 max(int4 a, int4 b);
 ccl_device_inline int4 clamp(const int4& a, const int4& mn, const int4& mx);
@@ -65,6 +69,42 @@ ccl_device_inline int4 operator>>(const int4& a, int i)
 #endif
 }
 
+ccl_device_inline int4 operator<<(const int4& a, int i)
+{
+#ifdef __KERNEL_SSE__
+       return int4(_mm_slli_epi32(a.m128, i));
+#else
+       return make_int4(a.x << i, a.y << i, a.z << i, a.w << i);
+#endif
+}
+
+ccl_device_inline int4 operator<(const int4& a, const int4& b)
+{
+#ifdef __KERNEL_SSE__
+       return int4(_mm_cmplt_epi32(a.m128, b.m128));
+#else
+       return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w);
+#endif
+}
+
+ccl_device_inline int4 operator>=(const int4& a, const int4& b)
+{
+#ifdef __KERNEL_SSE__
+       return int4(_mm_xor_si128(_mm_set1_epi32(0xffffffff), _mm_cmplt_epi32(a.m128, b.m128)));
+#else
+       return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w);
+#endif
+}
+
+ccl_device_inline int4 operator&(const int4& a, const int4& b)
+{
+#ifdef __KERNEL_SSE__
+       return int4(_mm_and_si128(a.m128, b.m128));
+#else
+       return make_int4(a.x & b.x, a.y & b.y, a.z & b.z, a.w & b.w);
+#endif
+}
+
 ccl_device_inline int4 min(int4 a, int4 b)
 {
 #if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
index cdd0ecbdae52fa4b3cede53ec7a734528975e2d6..4ef162f1ac66093fe6af9e64fc0b991ea15ddf78 100644 (file)
@@ -26,6 +26,7 @@ CCL_NAMESPACE_BEGIN
 #ifndef __KERNEL_GPU__
 
 struct float3;
+struct float4;
 
 struct ccl_try_align(16) int4 {
 #ifdef __KERNEL_SSE__
@@ -53,6 +54,7 @@ struct ccl_try_align(16) int4 {
 ccl_device_inline int4 make_int4(int i);
 ccl_device_inline int4 make_int4(int x, int y, int z, int w);
 ccl_device_inline int4 make_int4(const float3& f);
+ccl_device_inline int4 make_int4(const float4& f);
 ccl_device_inline void print_int4(const char *label, const int4& a);
 #endif  /* __KERNEL_GPU__ */
 
index 07cdc88f2dcd7ab7f7916028abb06af667746e38..a62561709deb9a9ef7b03457333b38b95a01df34 100644 (file)
@@ -104,6 +104,16 @@ ccl_device_inline int4 make_int4(const float3& f)
        return a;
 }
 
+ccl_device_inline int4 make_int4(const float4& f)
+{
+#ifdef __KERNEL_SSE__
+       int4 a(_mm_cvtps_epi32(f.m128));
+#else
+       int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w};
+#endif
+       return a;
+}
+
 ccl_device_inline void print_int4(const char *label, const int4& a)
 {
        printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w);
index c8ba2134a730bd84754665e3729d515113c19f08..66730f8c8371c629e08bc1efc9106d037b56e0f4 100644 (file)
@@ -182,15 +182,12 @@ static void interp_slerp_co_no_v3(
        /* calculate sphere 'center' */
        {
                /* use point on plane to */
-               float plane_a[4], plane_b[4], plane_c[4];
                float no_mid[3], no_ortho[3];
                /* pass this as an arg instead */
 #if 0
                float no_dir[3];
 #endif
 
-               float v_a_no_ortho[3], v_b_no_ortho[3];
-
                add_v3_v3v3(no_mid, no_a, no_b);
                normalize_v3(no_mid);
 
@@ -200,24 +197,28 @@ static void interp_slerp_co_no_v3(
 #endif
 
                /* axis of slerp */
+               bool center_ok = false;
                cross_v3_v3v3(no_ortho, no_mid, no_dir);
-               normalize_v3(no_ortho);
-
-               /* create planes */
-               cross_v3_v3v3(v_a_no_ortho, no_ortho, no_a);
-               cross_v3_v3v3(v_b_no_ortho, no_ortho, no_b);
-               project_v3_plane(v_a_no_ortho, no_ortho, v_a_no_ortho);
-               project_v3_plane(v_b_no_ortho, no_ortho, v_b_no_ortho);
-
-               plane_from_point_normal_v3(plane_a, co_a, v_a_no_ortho);
-               plane_from_point_normal_v3(plane_b, co_b, v_b_no_ortho);
-               plane_from_point_normal_v3(plane_c, co_b, no_ortho);
-
-               /* find the sphere center from 3 planes */
-               if (isect_plane_plane_plane_v3(plane_a, plane_b, plane_c, center)) {
-                       /* pass */
+               if (normalize_v3(no_ortho) != 0.0f) {
+                       float plane_a[4], plane_b[4], plane_c[4];
+                       float v_a_no_ortho[3], v_b_no_ortho[3];
+
+                       /* create planes */
+                       cross_v3_v3v3(v_a_no_ortho, no_ortho, no_a);
+                       cross_v3_v3v3(v_b_no_ortho, no_ortho, no_b);
+                       project_v3_plane(v_a_no_ortho, no_ortho, v_a_no_ortho);
+                       project_v3_plane(v_b_no_ortho, no_ortho, v_b_no_ortho);
+
+                       plane_from_point_normal_v3(plane_a, co_a, v_a_no_ortho);
+                       plane_from_point_normal_v3(plane_b, co_b, v_b_no_ortho);
+                       plane_from_point_normal_v3(plane_c, co_b, no_ortho);
+
+                       /* find the sphere center from 3 planes */
+                       if (isect_plane_plane_plane_v3(plane_a, plane_b, plane_c, center)) {
+                               center_ok = true;
+                       }
                }
-               else {
+               if (center_ok == false) {
                        mid_v3_v3v3(center, co_a, co_b);
                }
        }
index 8deed25e8543eda5e2d4cb9564b86c08c4a326b3..9cff85f3dfa83303ac2c9c0921eb7a90174ee08e 100755 (executable)
@@ -374,7 +374,7 @@ class Report:
         testname = test_get_name(filepath)
         print_message(testname, 'SUCCESS', 'RUN')
         time_start = time.time()
-        tmp_filepath = os.path.join(self.output_dir, "tmp")
+        tmp_filepath = os.path.join(self.output_dir, "tmp_" + testname)
 
         error = render_cb(filepath, tmp_filepath)
         status = "FAIL"