Code refactor: add WorkTile struct for passing work to kernel.
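
For reference, a minimal sketch of the WorkTile fields this header reads,
inferred from get_work_pixel() below; the actual struct is defined elsewhere
in the kernel headers and may carry additional members:

typedef struct WorkTile {
	uint x, y;          /* Tile origin within the render buffer. */
	uint w, h;          /* Tile dimensions in pixels. */
	uint start_sample;  /* First sample to render for this tile. */
} WorkTile;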
[blender-staging.git] intern/cycles/kernel/kernel_work_stealing.h
index 7d559b1aa314959dd516066a02dd4347fd2fbc53..0c2d9379b63b5987ef1ed128a2b0ad11f6f386e6 100644
 #ifndef __KERNEL_WORK_STEALING_H__
 #define __KERNEL_WORK_STEALING_H__
 
+CCL_NAMESPACE_BEGIN
+
 /*
  * Utility functions for work stealing
  */
 
-#ifdef __WORK_STEALING__
-
 #ifdef __KERNEL_OPENCL__
 #  pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
 #endif
 
-uint get_group_id_with_ray_index(uint ray_index,
-                                 uint tile_dim_x,
-                                 uint tile_dim_y,
-                                 uint parallel_samples,
-                                 int dim)
+#ifdef __SPLIT_KERNEL__
+/* Returns true if there is work */
+ccl_device bool get_next_work(KernelGlobals *kg,
+                              ccl_global uint *work_pools,
+                              uint total_work_size,
+                              uint ray_index,
+                              ccl_private uint *global_work_index)
 {
-       if(dim == 0) {
-               uint x_span = ray_index % (tile_dim_x * parallel_samples);
-               return x_span / get_local_size(0);
+       /* With a small amount of work there may be more threads than work items,
+        * due to rounding up of the global size; stop such threads immediately. */
+       if(ray_index >= total_work_size) {
+               return false;
        }
-       else /*if(dim == 1)*/ {
-               kernel_assert(dim == 1);
-               uint y_span = ray_index / (tile_dim_x * parallel_samples);
-               return y_span / get_local_size(1);
-       }
-}
-
-uint get_total_work(uint tile_dim_x,
-                    uint tile_dim_y,
-                    uint grp_idx,
-                    uint grp_idy,
-                    uint num_samples)
-{
-       uint threads_within_tile_border_x =
-               (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
-                                                    : get_local_size(0);
-       uint threads_within_tile_border_y =
-               (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
-                                                    : get_local_size(1);
-
-       threads_within_tile_border_x =
-               (threads_within_tile_border_x == 0) ? get_local_size(0)
-                                                   : threads_within_tile_border_x;
-       threads_within_tile_border_y =
-               (threads_within_tile_border_y == 0) ? get_local_size(1)
-                                                   : threads_within_tile_border_y;
-
-       return threads_within_tile_border_x *
-              threads_within_tile_border_y *
-              num_samples;
-}
 
-/* Returns 0 in case there is no next work available */
-/* Returns 1 in case work assigned is valid */
-int get_next_work(ccl_global uint *work_pool,
-                  ccl_private uint *my_work,
-                  uint tile_dim_x,
-                  uint tile_dim_y,
-                  uint num_samples,
-                  uint parallel_samples,
-                  uint ray_index)
-{
-       uint grp_idx = get_group_id_with_ray_index(ray_index,
-                                                  tile_dim_x,
-                                                  tile_dim_y,
-                                                  parallel_samples,
-                                                  0);
-       uint grp_idy = get_group_id_with_ray_index(ray_index,
-                                                  tile_dim_x,
-                                                  tile_dim_y,
-                                                  parallel_samples,
-                                                  1);
-       uint total_work = get_total_work(tile_dim_x,
-                                        tile_dim_y,
-                                        grp_idx,
-                                        grp_idy,
-                                        num_samples);
-       uint group_index = grp_idy * get_num_groups(0) + grp_idx;
-       *my_work = atomic_inc(&work_pool[group_index]);
-       return (*my_work < total_work) ? 1 : 0;
-}
+       /* Increase atomic work index counter in pool. */
+       uint pool = ray_index / WORK_POOL_SIZE;
+       uint work_index = atomic_fetch_and_inc_uint32(&work_pools[pool]);
 
-/* This function assumes that the passed my_work is valid. */
-/* Decode sample number w.r.t. assigned my_work. */
-uint get_my_sample(uint my_work,
-                   uint tile_dim_x,
-                   uint tile_dim_y,
-                   uint parallel_samples,
-                   uint ray_index)
-{
-       uint grp_idx = get_group_id_with_ray_index(ray_index,
-                                                  tile_dim_x,
-                                                  tile_dim_y,
-                                                  parallel_samples,
-                                                  0);
-       uint grp_idy = get_group_id_with_ray_index(ray_index,
-                                                  tile_dim_x,
-                                                  tile_dim_y,
-                                                  parallel_samples,
-                                                  1);
-       uint threads_within_tile_border_x =
-               (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
-                                                    : get_local_size(0);
-       uint threads_within_tile_border_y =
-               (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
-                                                    : get_local_size(1);
+       /* Map per-pool work index to a global work index. */
+       uint global_size = ccl_global_size(0) * ccl_global_size(1);
+       kernel_assert(global_size % WORK_POOL_SIZE == 0);
+       kernel_assert(ray_index < global_size);
 
-       threads_within_tile_border_x =
-               (threads_within_tile_border_x == 0) ? get_local_size(0)
-                                                   : threads_within_tile_border_x;
-       threads_within_tile_border_y =
-               (threads_within_tile_border_y == 0) ? get_local_size(1)
-                                                   : threads_within_tile_border_y;
+       *global_work_index = (work_index / WORK_POOL_SIZE) * global_size
+                          + (pool * WORK_POOL_SIZE)
+                          + (work_index % WORK_POOL_SIZE);
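+       /* E.g. with WORK_POOL_SIZE 64 and a global size of 4096 threads, pool 2
+        * hands out global indices 128..191, then 4224..4287, and so on: each
+        * pool walks its own WORK_POOL_SIZE-wide slice, striding by global_size. */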
 
-       return my_work /
-              (threads_within_tile_border_x * threads_within_tile_border_y);
+       /* If the index runs past the total work size, all work for this pool is done. */
+       return (*global_work_index < total_work_size);
 }
+#endif
 
-/* Decode pixel and tile position w.r.t. assigned my_work. */
-void get_pixel_tile_position(ccl_private uint *pixel_x,
-                             ccl_private uint *pixel_y,
-                             ccl_private uint *tile_x,
-                             ccl_private uint *tile_y,
-                             uint my_work,
-                             uint tile_dim_x,
-                             uint tile_dim_y,
-                             uint tile_offset_x,
-                             uint tile_offset_y,
-                             uint parallel_samples,
-                             uint ray_index)
+/* Map global work index to tile, pixel X/Y and sample. */
+ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
+                                      uint global_work_index,
+                                      ccl_private uint *x,
+                                      ccl_private uint *y,
+                                      ccl_private uint *sample)
 {
-       uint grp_idx = get_group_id_with_ray_index(ray_index,
-                                                  tile_dim_x,
-                                                  tile_dim_y,
-                                                  parallel_samples,
-                                                  0);
-       uint grp_idy = get_group_id_with_ray_index(ray_index,
-                                                  tile_dim_x,
-                                                  tile_dim_y,
-                                                  parallel_samples,
-                                                  1);
-       uint threads_within_tile_border_x =
-               (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
-                                                    : get_local_size(0);
-       uint threads_within_tile_border_y =
-               (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
-                                                    : get_local_size(1);
-
-       threads_within_tile_border_x =
-               (threads_within_tile_border_x == 0) ? get_local_size(0)
-                                                   : threads_within_tile_border_x;
-       threads_within_tile_border_y =
-               (threads_within_tile_border_y == 0) ? get_local_size(1)
-                                                   : threads_within_tile_border_y;
-
-       uint total_associated_pixels =
-               threads_within_tile_border_x * threads_within_tile_border_y;
-       uint work_group_pixel_index = my_work % total_associated_pixels;
-       uint work_group_pixel_x =
-               work_group_pixel_index % threads_within_tile_border_x;
-       uint work_group_pixel_y =
-               work_group_pixel_index / threads_within_tile_border_x;
-
-       *pixel_x =
-               tile_offset_x + (grp_idx * get_local_size(0)) + work_group_pixel_x;
-       *pixel_y =
-               tile_offset_y + (grp_idy * get_local_size(1)) + work_group_pixel_y;
-       *tile_x = *pixel_x - tile_offset_x;
-       *tile_y = *pixel_y - tile_offset_y;
+       uint tile_pixels = tile->w * tile->h;
+       uint sample_offset = global_work_index / tile_pixels;
+       uint pixel_offset = global_work_index - sample_offset * tile_pixels;
+       uint y_offset = pixel_offset / tile->w;
+       uint x_offset = pixel_offset - y_offset * tile->w;
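+       /* E.g. in a 32x32 tile, global_work_index 1500 decodes to
+        * sample_offset 1, pixel_offset 476, y_offset 14 and x_offset 28. */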
+
+       *x = tile->x + x_offset;
+       *y = tile->y + y_offset;
+       *sample = tile->start_sample + sample_offset;
 }
 
-#endif  /* __WORK_STEALING__ */
+CCL_NAMESPACE_END
 
 #endif  /* __KERNEL_WORK_STEALING_H__ */
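
For illustration, a hypothetical caller (in the split kernel, where
get_next_work is compiled in) might drive the two helpers like this;
example_render_loop and the elided path-tracing body are assumptions,
while the helper signatures match the code above:

ccl_device void example_render_loop(KernelGlobals *kg,
                                    ccl_global uint *work_pools,
                                    ccl_global const WorkTile *tile,
                                    uint total_work_size,
                                    uint ray_index)
{
	uint global_work_index;
	/* Keep fetching work items until this thread's pool runs dry. */
	while(get_next_work(kg, work_pools, total_work_size, ray_index,
	                    &global_work_index))
	{
		uint x, y, sample;
		get_work_pixel(tile, global_work_index, &x, &y, &sample);
		/* ... render pixel (x, y) at the given sample ... */
	}
}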