Fix OpenCL performance regression after cubic interpolation.
author Brecht Van Lommel <brechtvanlommel@gmail.com>
Sun, 15 Oct 2017 15:40:01 +0000 (17:40 +0200)
committer Brecht Van Lommel <brechtvanlommel@gmail.com>
Sun, 15 Oct 2017 15:46:50 +0000 (17:46 +0200)
Reorganize code to reduce register pressure.

intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h

index b7be4fe440964b5b7f63d94b34d9f468866b0f44..5ca07eaeb05054b421316d84a6fb5e8d0a652ea0 100644 (file)
@@ -87,7 +87,7 @@ ccl_device T kernel_tex_image_interp_bicubic(const TextureInfo& info, CUtexObjec
                               g1x * tex2D<T>(tex, x1, y1));
 }
 
-/* Fast tricubic texture lookup using 8 bilinear lookups. */
+/* Fast tricubic texture lookup using 8 trilinear lookups. */
 template<typename T>
 ccl_device T kernel_tex_image_interp_bicubic_3d(const TextureInfo& info, CUtexObject tex, float x, float y, float z)
 {
index d908af78c7a3637936b382428c7311d030963036..faa9dd66d0ebb152d4ce9aea5e5e44fed9375084 100644 (file)
@@ -27,9 +27,21 @@ ccl_device_inline ccl_global TextureInfo* kernel_tex_info(KernelGlobals *kg, uin
 
 #define tex_fetch(type, info, index) ((ccl_global type*)(kg->buffers[info->cl_buffer] + info->data))[(index)]
 
-ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int offset)
+ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width)
+{
+       x %= width;
+       if(x < 0)
+               x += width;
+       return x;
+}
+
+ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width)
+{
+       return clamp(x, 0, width-1);
+}
+
+ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, const ccl_global TextureInfo *info, int id, int offset)
 {
-       const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
        const int texture_type = kernel_tex_type(id);
 
        /* Float4 */
@@ -55,19 +67,45 @@ ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int o
        }
 }
 
-ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width)
+ccl_device_inline float4 svm_image_texture_read_2d(KernelGlobals *kg, int id, int x, int y)
 {
-       x %= width;
-       if(x < 0)
-               x += width;
-       return x;
+       const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
+
+       /* Wrap */
+       if(info->extension == EXTENSION_REPEAT) {
+               x = svm_image_texture_wrap_periodic(x, info->width);
+               y = svm_image_texture_wrap_periodic(y, info->height);
+       }
+       else {
+               x = svm_image_texture_wrap_clamp(x, info->width);
+               y = svm_image_texture_wrap_clamp(y, info->height);
+       }
+
+       int offset = x + info->width * y;
+       return svm_image_texture_read(kg, info, id, offset);
 }
 
-ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width)
+ccl_device_inline float4 svm_image_texture_read_3d(KernelGlobals *kg, int id, int x, int y, int z)
 {
-       return clamp(x, 0, width-1);
+       const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
+
+       /* Wrap */
+       if(info->extension == EXTENSION_REPEAT) {
+               x = svm_image_texture_wrap_periodic(x, info->width);
+               y = svm_image_texture_wrap_periodic(y, info->height);
+               z = svm_image_texture_wrap_periodic(z, info->depth);
+       }
+       else {
+               x = svm_image_texture_wrap_clamp(x, info->width);
+               y = svm_image_texture_wrap_clamp(y, info->height);
+               z = svm_image_texture_wrap_clamp(z, info->depth);
+       }
+
+       int offset = x + info->width * y + info->width * info->height * z;
+       return svm_image_texture_read(kg, info, id, offset);
 }
 
+
 ccl_device_inline float svm_image_texture_frac(float x, int *ix)
 {
        int i = float_to_int(x) - ((x < 0.0f)? 1: 0);
@@ -87,107 +125,52 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
 {
        const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
 
-       uint width = info->width;
-       uint height = info->height;
-       uint interpolation = info->interpolation;
-       uint extension = info->extension;
+       if(info->extension == EXTENSION_CLIP) {
+               if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+                       return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+               }
+       }
 
-       /* Actual sampling. */
-       if(interpolation == INTERPOLATION_CLOSEST) {
+       if(info->interpolation == INTERPOLATION_CLOSEST) {
+               /* Closest interpolation. */
                int ix, iy;
-               svm_image_texture_frac(x*width, &ix);
-               svm_image_texture_frac(y*height, &iy);
-
-               if(extension == EXTENSION_REPEAT) {
-                       ix = svm_image_texture_wrap_periodic(ix, width);
-                       iy = svm_image_texture_wrap_periodic(iy, height);
-               }
-               else {
-                       if(extension == EXTENSION_CLIP) {
-                               if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
-                                       return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-                               }
-                       }
-                       /* Fall through. */
-                       /* EXTENSION_EXTEND */
-                       ix = svm_image_texture_wrap_clamp(ix, width);
-                       iy = svm_image_texture_wrap_clamp(iy, height);
-               }
+               svm_image_texture_frac(x*info->width, &ix);
+               svm_image_texture_frac(y*info->height, &iy);
 
-               return svm_image_texture_read(kg, id, ix + iy*width);
+               return svm_image_texture_read_2d(kg, id, ix, iy);
+       }
+       else if(info->interpolation == INTERPOLATION_LINEAR) {
+               /* Bilinear interpolation. */
+               int ix, iy;
+               float tx = svm_image_texture_frac(x*info->width - 0.5f, &ix);
+               float ty = svm_image_texture_frac(y*info->height - 0.5f, &iy);
+
+               float4 r;
+               r =  (1.0f - ty)*(1.0f - tx)*svm_image_texture_read_2d(kg, id, ix, iy);
+               r += (1.0f - ty)*tx*svm_image_texture_read_2d(kg, id, ix+1, iy);
+               r += ty*(1.0f - tx)*svm_image_texture_read_2d(kg, id, ix, iy+1);
+               r += ty*tx*svm_image_texture_read_2d(kg, id, ix+1, iy+1);
+               return r;
        }
        else {
-               /* Bilinear or bicubic interpolation. */
-               int ix, iy, nix, niy;
-               float tx = svm_image_texture_frac(x*width - 0.5f, &ix);
-               float ty = svm_image_texture_frac(y*height - 0.5f, &iy);
-
-               if(extension == EXTENSION_REPEAT) {
-                       ix = svm_image_texture_wrap_periodic(ix, width);
-                       iy = svm_image_texture_wrap_periodic(iy, height);
-                       nix = svm_image_texture_wrap_periodic(ix+1, width);
-                       niy = svm_image_texture_wrap_periodic(iy+1, height);
-               }
-               else {
-                       if(extension == EXTENSION_CLIP) {
-                               if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
-                                       return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-                               }
-                       }
-                       ix = svm_image_texture_wrap_clamp(ix, width);
-                       iy = svm_image_texture_wrap_clamp(iy, height);
-                       nix = svm_image_texture_wrap_clamp(ix+1, width);
-                       niy = svm_image_texture_wrap_clamp(iy+1, height);
-               }
-
-               if(interpolation == INTERPOLATION_LINEAR) {
-                       /* Bilinear interpolation. */
-                       float4 r;
-                       r = (1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, ix + iy*width);
-                       r += (1.0f - ty)*tx*svm_image_texture_read(kg, id, nix + iy*width);
-                       r += ty*(1.0f - tx)*svm_image_texture_read(kg, id, ix + niy*width);
-                       r += ty*tx*svm_image_texture_read(kg, id, nix + niy*width);
-                       return r;
-               }
-
                /* Bicubic interpolation. */
-               int pix, piy, nnix, nniy;
-               if(extension == EXTENSION_REPEAT) {
-                       pix = svm_image_texture_wrap_periodic(ix-1, width);
-                       piy = svm_image_texture_wrap_periodic(iy-1, height);
-                       nnix = svm_image_texture_wrap_periodic(ix+2, width);
-                       nniy = svm_image_texture_wrap_periodic(iy+2, height);
-               }
-               else {
-                       pix = svm_image_texture_wrap_clamp(ix-1, width);
-                       piy = svm_image_texture_wrap_clamp(iy-1, height);
-                       nnix = svm_image_texture_wrap_clamp(ix+2, width);
-                       nniy = svm_image_texture_wrap_clamp(iy+2, height);
-               }
+               int ix, iy;
+               float tx = svm_image_texture_frac(x*info->width - 0.5f, &ix);
+               float ty = svm_image_texture_frac(y*info->height - 0.5f, &iy);
 
-               const int xc[4] = {pix, ix, nix, nnix};
-               const int yc[4] = {width * piy,
-                                  width * iy,
-                                  width * niy,
-                                  width * nniy};
                float u[4], v[4];
-               /* Some helper macro to keep code reasonable size,
-                * let compiler to inline all the matrix multiplications.
-                */
-#define DATA(x, y) (svm_image_texture_read(kg, id, xc[x] + yc[y]))
-#define TERM(col) \
-               (v[col] * (u[0] * DATA(0, col) + \
-                          u[1] * DATA(1, col) + \
-                          u[2] * DATA(2, col) + \
-                          u[3] * DATA(3, col)))
-
                SET_CUBIC_SPLINE_WEIGHTS(u, tx);
                SET_CUBIC_SPLINE_WEIGHTS(v, ty);
 
-               /* Actual interpolation. */
-               return TERM(0) + TERM(1) + TERM(2) + TERM(3);
-#undef TERM
-#undef DATA
+               float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+               for(int y = 0; y < 4; y++) {
+                       for(int x = 0; x < 4; x++) {
+                               float weight = u[x]*v[y];
+                               r += weight*svm_image_texture_read_2d(kg, id, ix+x-1, iy+y-1);
+                       }
+               }
+               return r;
        }
 }
 
@@ -196,145 +179,67 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x,
 {
        const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
 
-       uint width = info->width;
-       uint height = info->height;
-       uint depth = info->depth;
+       if(info->extension == EXTENSION_CLIP) {
+               if(x < 0.0f || y < 0.0f || z < 0.0f ||
+                  x > 1.0f || y > 1.0f || z > 1.0f)
+               {
+                       return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+               }
+       }
+
        uint interpolation = (interp == INTERPOLATION_NONE)? info->interpolation: interp;
-       uint extension = info->extension;
 
-       /* Actual sampling. */
        if(interpolation == INTERPOLATION_CLOSEST) {
+               /* Closest interpolation. */
                int ix, iy, iz;
-               svm_image_texture_frac(x*width, &ix);
-               svm_image_texture_frac(y*height, &iy);
-               svm_image_texture_frac(z*depth, &iz);
-
-               if(extension == EXTENSION_REPEAT) {
-                       ix = svm_image_texture_wrap_periodic(ix, width);
-                       iy = svm_image_texture_wrap_periodic(iy, height);
-                       iz = svm_image_texture_wrap_periodic(iz, depth);
-               }
-               else {
-                       if(extension == EXTENSION_CLIP) {
-                               if(x < 0.0f || y < 0.0f || z < 0.0f ||
-                                  x > 1.0f || y > 1.0f || z > 1.0f)
-                               {
-                                       return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-                               }
-                       }
-                       /* Fall through. */
-                       /* EXTENSION_EXTEND */
-                       ix = svm_image_texture_wrap_clamp(ix, width);
-                       iy = svm_image_texture_wrap_clamp(iy, height);
-                       iz = svm_image_texture_wrap_clamp(iz, depth);
-               }
-               return svm_image_texture_read(kg, id, ix + iy*width + iz*width*height);
+               svm_image_texture_frac(x*info->width, &ix);
+               svm_image_texture_frac(y*info->height, &iy);
+               svm_image_texture_frac(z*info->depth, &iz);
+
+               return svm_image_texture_read_3d(kg, id, ix, iy, iz);
+       }
+       else if(interpolation == INTERPOLATION_LINEAR) {
+               /* Bilinear interpolation. */
+               int ix, iy, iz;
+               float tx = svm_image_texture_frac(x*info->width - 0.5f, &ix);
+               float ty = svm_image_texture_frac(y*info->height - 0.5f, &iy);
+               float tz = svm_image_texture_frac(z*info->depth - 0.5f, &iz);
+
+               float4 r;
+               r  = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read_3d(kg, id, ix, iy, iz);
+               r += (1.0f - tz)*(1.0f - ty)*tx*svm_image_texture_read_3d(kg, id, ix+1, iy, iz);
+               r += (1.0f - tz)*ty*(1.0f - tx)*svm_image_texture_read_3d(kg, id, ix, iy+1, iz);
+               r += (1.0f - tz)*ty*tx*svm_image_texture_read_3d(kg, id, ix+1, iy+1, iz);
+
+               r += tz*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read_3d(kg, id, ix, iy, iz+1);
+               r += tz*(1.0f - ty)*tx*svm_image_texture_read_3d(kg, id, ix+1, iy, iz+1);
+               r += tz*ty*(1.0f - tx)*svm_image_texture_read_3d(kg, id, ix, iy+1, iz+1);
+               r += tz*ty*tx*svm_image_texture_read_3d(kg, id, ix+1, iy+1, iz+1);
+               return r;
        }
        else {
-               /* Bilinear or bicubic interpolation. */
-               int ix, iy, iz, nix, niy, niz;
-               float tx = svm_image_texture_frac(x*(float)width - 0.5f, &ix);
-               float ty = svm_image_texture_frac(y*(float)height - 0.5f, &iy);
-               float tz = svm_image_texture_frac(z*(float)depth - 0.5f, &iz);
-
-               if(extension == EXTENSION_REPEAT) {
-                       ix = svm_image_texture_wrap_periodic(ix, width);
-                       iy = svm_image_texture_wrap_periodic(iy, height);
-                       iz = svm_image_texture_wrap_periodic(iz, depth);
-
-                       nix = svm_image_texture_wrap_periodic(ix+1, width);
-                       niy = svm_image_texture_wrap_periodic(iy+1, height);
-                       niz = svm_image_texture_wrap_periodic(iz+1, depth);
-               }
-               else {
-                       if(extension == EXTENSION_CLIP) {
-                               if(x < 0.0f || y < 0.0f || z < 0.0f ||
-                                  x > 1.0f || y > 1.0f || z > 1.0f)
-                               {
-                                       return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-                               }
-                       }
-                       /* Fall through. */
-                       /*  EXTENSION_EXTEND */
-                       nix = svm_image_texture_wrap_clamp(ix+1, width);
-                       niy = svm_image_texture_wrap_clamp(iy+1, height);
-                       niz = svm_image_texture_wrap_clamp(iz+1, depth);
-
-                       ix = svm_image_texture_wrap_clamp(ix, width);
-                       iy = svm_image_texture_wrap_clamp(iy, height);
-                       iz = svm_image_texture_wrap_clamp(iz, depth);
-               }
-
-               if(interpolation == INTERPOLATION_LINEAR) {
-                       /* Bilinear interpolation. */
-                       float4 r;
-                       r  = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, ix + iy*width + iz*width*height);
-                       r += (1.0f - tz)*(1.0f - ty)*tx*svm_image_texture_read(kg, id, nix + iy*width + iz*width*height);
-                       r += (1.0f - tz)*ty*(1.0f - tx)*svm_image_texture_read(kg, id, ix + niy*width + iz*width*height);
-                       r += (1.0f - tz)*ty*tx*svm_image_texture_read(kg, id, nix + niy*width + iz*width*height);
-
-                       r += tz*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, ix + iy*width + niz*width*height);
-                       r += tz*(1.0f - ty)*tx*svm_image_texture_read(kg, id, nix + iy*width + niz*width*height);
-                       r += tz*ty*(1.0f - tx)*svm_image_texture_read(kg, id, ix + niy*width + niz*width*height);
-                       r += tz*ty*tx*svm_image_texture_read(kg, id, nix + niy*width + niz*width*height);
-                       return r;
-               }
-
                /* Bicubic interpolation. */
-               int pix, piy, piz, nnix, nniy, nniz;
-               if(extension == EXTENSION_REPEAT) {
-                       pix = svm_image_texture_wrap_periodic(ix-1, width);
-                       piy = svm_image_texture_wrap_periodic(iy-1, height);
-                       piz = svm_image_texture_wrap_periodic(iz-1, depth);
-                       nnix = svm_image_texture_wrap_periodic(ix+2, width);
-                       nniy = svm_image_texture_wrap_periodic(iy+2, height);
-                       nniz = svm_image_texture_wrap_periodic(iz+2, depth);
-               }
-               else {
-                       pix = svm_image_texture_wrap_clamp(ix-1, width);
-                       piy = svm_image_texture_wrap_clamp(iy-1, height);
-                       piz = svm_image_texture_wrap_clamp(iz-1, depth);
-                       nnix = svm_image_texture_wrap_clamp(ix+2, width);
-                       nniy = svm_image_texture_wrap_clamp(iy+2, height);
-                       nniz = svm_image_texture_wrap_clamp(iz+2, depth);
-               }
+               int ix, iy, iz;
+               float tx = svm_image_texture_frac(x*info->width - 0.5f, &ix);
+               float ty = svm_image_texture_frac(y*info->height - 0.5f, &iy);
+               float tz = svm_image_texture_frac(z*info->depth - 0.5f, &iz);
 
-               const int xc[4] = {pix, ix, nix, nnix};
-               const int yc[4] = {width * piy,
-                                  width * iy,
-                                  width * niy,
-                                  width * nniy};
-               const int zc[4] = {width * height * piz,
-                                  width * height * iz,
-                                  width * height * niz,
-                                  width * height * nniz};
                float u[4], v[4], w[4];
-
-               /* Some helper macro to keep code reasonable size,
-                * let compiler to inline all the matrix multiplications.
-                */
-#define DATA(x, y, z) (svm_image_texture_read(kg, id, xc[x] + yc[y] + zc[z]))
-#define COL_TERM(col, row) \
-               (v[col] * (u[0] * DATA(0, col, row) + \
-                          u[1] * DATA(1, col, row) + \
-                          u[2] * DATA(2, col, row) + \
-                          u[3] * DATA(3, col, row)))
-#define ROW_TERM(row) \
-               (w[row] * (COL_TERM(0, row) + \
-                          COL_TERM(1, row) + \
-                          COL_TERM(2, row) + \
-                          COL_TERM(3, row)))
-
                SET_CUBIC_SPLINE_WEIGHTS(u, tx);
                SET_CUBIC_SPLINE_WEIGHTS(v, ty);
                SET_CUBIC_SPLINE_WEIGHTS(w, tz);
 
-               /* Actual interpolation. */
-               return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
+               float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 
-#undef COL_TERM
-#undef ROW_TERM
-#undef DATA
+               for(int z = 0; z < 4; z++) {
+                       for(int y = 0; y < 4; y++) {
+                               for(int x = 0; x < 4; x++) {
+                                       float weight = u[x]*v[y]*w[z];
+                                       r += weight*svm_image_texture_read_3d(kg, id, ix+x-1, iy+y-1, iz+z-1);
+                               }
+                       }
+               }
+               return r;
        }
 }