/*
 * Copyright 2011-2015 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef __KERNEL_WORK_STEALING_H__
#define __KERNEL_WORK_STEALING_H__

/*
 * Utility functions for work stealing
 */

#ifdef __WORK_STEALING__

#ifdef __KERNEL_OPENCL__
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#endif

uint get_group_id_with_ray_index(uint ray_index,
                                 uint tile_dim_x,
                                 uint tile_dim_y,
                                 uint parallel_samples,
                                 int dim)
{
        if(dim == 0) {
                uint x_span = ray_index % (tile_dim_x * parallel_samples);
                return x_span / get_local_size(0);
        }
        else /*if(dim == 1)*/ {
                kernel_assert(dim == 1);
                uint y_span = ray_index / (tile_dim_x * parallel_samples);
                return y_span / get_local_size(1);
        }
}
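
/* Example (illustrative only, with assumed values): with
 * get_local_size(0) == get_local_size(1) == 16, tile_dim_x == 64 and
 * parallel_samples == 1, a thread with ray_index == 2000 decodes to
 * x_span == 2000 % 64 == 16 and y_span == 2000 / 64 == 31, giving
 * work-group indices (16 / 16, 31 / 16) == (1, 1).
 */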

uint get_total_work(uint tile_dim_x,
                    uint tile_dim_y,
                    uint grp_idx,
                    uint grp_idy,
                    uint num_samples)
{
        uint threads_within_tile_border_x =
                (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
                                                     : get_local_size(0);
        uint threads_within_tile_border_y =
                (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
                                                     : get_local_size(1);

        threads_within_tile_border_x =
                (threads_within_tile_border_x == 0) ? get_local_size(0)
                                                    : threads_within_tile_border_x;
        threads_within_tile_border_y =
                (threads_within_tile_border_y == 0) ? get_local_size(1)
                                                    : threads_within_tile_border_y;

        return threads_within_tile_border_x *
               threads_within_tile_border_y *
               num_samples;
}
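
/* Example (illustrative only, with assumed values): for a hypothetical 37x25
 * tile rendered with 16x16 work-groups, get_num_groups() is (3, 2). The
 * bottom-right group covers 37 % 16 == 5 by 25 % 16 == 9 pixels, so with
 * num_samples == 4 its total work is 5 * 9 * 4 == 180 items. When a tile
 * dimension is an exact multiple of the work-group size, the modulo is 0 and
 * the fallback above restores the full get_local_size() instead.
 */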

/* Returns 0 when there is no more work available.
 * Returns 1 when the work assigned in my_work is valid.
 */
int get_next_work(ccl_global uint *work_pool,
                  ccl_private uint *my_work,
                  uint tile_dim_x,
                  uint tile_dim_y,
                  uint num_samples,
                  uint parallel_samples,
                  uint ray_index)
{
        uint grp_idx = get_group_id_with_ray_index(ray_index,
                                                   tile_dim_x,
                                                   tile_dim_y,
                                                   parallel_samples,
                                                   0);
        uint grp_idy = get_group_id_with_ray_index(ray_index,
                                                   tile_dim_x,
                                                   tile_dim_y,
                                                   parallel_samples,
                                                   1);
        uint total_work = get_total_work(tile_dim_x,
                                         tile_dim_y,
                                         grp_idx,
                                         grp_idy,
                                         num_samples);
        uint group_index = grp_idy * get_num_groups(0) + grp_idx;
        *my_work = atomic_inc(&work_pool[group_index]);
        return (*my_work < total_work) ? 1 : 0;
}

/* Decode the sample number from the assigned my_work.
 * Assumes that the passed my_work is valid.
 */
uint get_my_sample(uint my_work,
                   uint tile_dim_x,
                   uint tile_dim_y,
                   uint parallel_samples,
                   uint ray_index)
{
        uint grp_idx = get_group_id_with_ray_index(ray_index,
                                                   tile_dim_x,
                                                   tile_dim_y,
                                                   parallel_samples,
                                                   0);
        uint grp_idy = get_group_id_with_ray_index(ray_index,
                                                   tile_dim_x,
                                                   tile_dim_y,
                                                   parallel_samples,
                                                   1);
        uint threads_within_tile_border_x =
                (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
                                                     : get_local_size(0);
        uint threads_within_tile_border_y =
                (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
                                                     : get_local_size(1);

        threads_within_tile_border_x =
                (threads_within_tile_border_x == 0) ? get_local_size(0)
                                                    : threads_within_tile_border_x;
        threads_within_tile_border_y =
                (threads_within_tile_border_y == 0) ? get_local_size(1)
                                                    : threads_within_tile_border_y;

        return my_work /
               (threads_within_tile_border_x * threads_within_tile_border_y);
}
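
/* Example (illustrative only, with assumed values): an interior 16x16
 * work-group covers 256 pixels, so a hypothetical my_work == 600 decodes to
 * sample 600 / 256 == 2, while the remainder 600 % 256 == 88 is the pixel
 * index within the group that get_pixel_tile_position() below decodes.
 */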

/* Decode pixel and tile position w.r.t. assigned my_work. */
void get_pixel_tile_position(ccl_private uint *pixel_x,
                             ccl_private uint *pixel_y,
                             ccl_private uint *tile_x,
                             ccl_private uint *tile_y,
                             uint my_work,
                             uint tile_dim_x,
                             uint tile_dim_y,
                             uint tile_offset_x,
                             uint tile_offset_y,
                             uint parallel_samples,
                             uint ray_index)
{
        uint grp_idx = get_group_id_with_ray_index(ray_index,
                                                   tile_dim_x,
                                                   tile_dim_y,
                                                   parallel_samples,
                                                   0);
        uint grp_idy = get_group_id_with_ray_index(ray_index,
                                                   tile_dim_x,
                                                   tile_dim_y,
                                                   parallel_samples,
                                                   1);
        uint threads_within_tile_border_x =
                (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
                                                     : get_local_size(0);
        uint threads_within_tile_border_y =
                (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
                                                     : get_local_size(1);

        threads_within_tile_border_x =
                (threads_within_tile_border_x == 0) ? get_local_size(0)
                                                    : threads_within_tile_border_x;
        threads_within_tile_border_y =
                (threads_within_tile_border_y == 0) ? get_local_size(1)
                                                    : threads_within_tile_border_y;

        uint total_associated_pixels =
                threads_within_tile_border_x * threads_within_tile_border_y;
        uint work_group_pixel_index = my_work % total_associated_pixels;
        uint work_group_pixel_x =
                work_group_pixel_index % threads_within_tile_border_x;
        uint work_group_pixel_y =
                work_group_pixel_index / threads_within_tile_border_x;

        *pixel_x =
                tile_offset_x + (grp_idx * get_local_size(0)) + work_group_pixel_x;
        *pixel_y =
                tile_offset_y + (grp_idy * get_local_size(1)) + work_group_pixel_y;
        *tile_x = *pixel_x - tile_offset_x;
        *tile_y = *pixel_y - tile_offset_y;
}
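
/* Example usage (illustrative sketch only, not part of this header): a
 * split-kernel style thread would typically keep pulling work items from the
 * per-work-group pool until get_next_work() reports that the pool is
 * exhausted, then decode each item into a sample and pixel position. The
 * surrounding variables (work_pool, tile_dim_x/y, tile_offset_x/y,
 * num_samples, parallel_samples, ray_index) are assumed to be provided by the
 * calling kernel.
 *
 *   uint my_work;
 *   while(get_next_work(work_pool, &my_work,
 *                       tile_dim_x, tile_dim_y,
 *                       num_samples, parallel_samples,
 *                       ray_index))
 *   {
 *           uint sample = get_my_sample(my_work,
 *                                       tile_dim_x, tile_dim_y,
 *                                       parallel_samples, ray_index);
 *           uint pixel_x, pixel_y, tile_x, tile_y;
 *           get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y,
 *                                   my_work,
 *                                   tile_dim_x, tile_dim_y,
 *                                   tile_offset_x, tile_offset_y,
 *                                   parallel_samples, ray_index);
 *           // ...trace a path for pixel (pixel_x, pixel_y) at this sample...
 *   }
 */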

#endif  /* __WORK_STEALING__ */

#endif  /* __KERNEL_WORK_STEALING_H__ */