intern/cycles/kernel/kernels/cuda/kernel.cu
/*
 * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* CUDA kernel entry points */

#include "../../kernel_compat_cuda.h"
#include "../../kernel_math.h"
#include "../../kernel_types.h"
#include "../../kernel_globals.h"
#include "../../kernel_film.h"
#include "../../kernel_path.h"
#include "../../kernel_path_branched.h"
#include "../../kernel_bake.h"

/* device data taken from CUDA occupancy calculator */
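/* For each architecture the first four values are hardware limits: the size
 * of the per-multiprocessor register file, the maximum number of resident
 * blocks per multiprocessor, the maximum threads per block and the maximum
 * registers a single thread may use. The "tunable parameters" are the block
 * width and the per-thread register budgets handed to __launch_bounds__
 * through CUDA_LAUNCH_BOUNDS below. */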

#ifdef __CUDA_ARCH__

/* 2.0 and 2.1 */
#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210
#define CUDA_MULTIPROCESSOR_MAX_REGISTERS 32768
#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8
#define CUDA_BLOCK_MAX_THREADS 1024
#define CUDA_THREAD_MAX_REGISTERS 63

/* tunable parameters */
#define CUDA_THREADS_BLOCK_WIDTH 16
#define CUDA_KERNEL_MAX_REGISTERS 32
#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40

/* 3.0 and 3.5 */
#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
#define CUDA_MULTIPROCESSOR_MAX_REGISTERS 65536
#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
#define CUDA_BLOCK_MAX_THREADS 1024
#define CUDA_THREAD_MAX_REGISTERS 63

/* tunable parameters */
#define CUDA_THREADS_BLOCK_WIDTH 16
#define CUDA_KERNEL_MAX_REGISTERS 63
#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63

/* 3.2 */
#elif __CUDA_ARCH__ == 320
#define CUDA_MULTIPROCESSOR_MAX_REGISTERS 32768
#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
#define CUDA_BLOCK_MAX_THREADS 1024
#define CUDA_THREAD_MAX_REGISTERS 63

/* tunable parameters */
#define CUDA_THREADS_BLOCK_WIDTH 16
#define CUDA_KERNEL_MAX_REGISTERS 63
#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63

/* 5.0, 5.2 and 5.3 */
#elif __CUDA_ARCH__ == 500 || __CUDA_ARCH__ == 520 || __CUDA_ARCH__ == 530
#define CUDA_MULTIPROCESSOR_MAX_REGISTERS 65536
#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
#define CUDA_BLOCK_MAX_THREADS 1024
#define CUDA_THREAD_MAX_REGISTERS 255

/* tunable parameters */
#define CUDA_THREADS_BLOCK_WIDTH 16
#define CUDA_KERNEL_MAX_REGISTERS 40
#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63

/* unknown architecture */
#else
#error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
#endif

/* compute number of threads per block and minimum blocks per multiprocessor
 * given the maximum number of registers per thread */

#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
        __launch_bounds__( \
                threads_block_width*threads_block_width, \
                CUDA_MULTIPROCESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \
                )
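
/* For example, on the 5.x architectures the path tracing kernel is compiled
 * for 16*16 = 256 threads per block with a budget of 40 registers per thread,
 * so the minimum number of resident blocks requested per multiprocessor is
 * 65536/(256*40) = 6 (integer division). */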

/* sanity checks */

#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
#error "Maximum number of threads per block exceeded"
#endif

#if CUDA_MULTIPROCESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS
#error "Maximum number of blocks per multiprocessor exceeded"
#endif

#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
#error "Maximum number of registers per thread exceeded"
#endif

#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
#error "Maximum number of registers per thread exceeded"
#endif

/* kernels */
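/* Each entry point maps one CUDA thread to one pixel of the requested tile
 * (or one work item for the shader and bake kernels). The launch grid may
 * cover more threads than there is work, so every kernel checks the tile
 * bounds before calling into the shared kernel code. */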

extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
{
        int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
        int y = sy + blockDim.y*blockIdx.y + threadIdx.y;

        if(x < sx + sw && y < sy + sh)
                kernel_path_trace(NULL, buffer, rng_state, sample, x, y, offset, stride);
}
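
/* Illustration only, not used by the kernel module itself: a minimal sketch
 * of how a host could launch the entry point above through the CUDA driver
 * API, assuming a 16x16 block to match CUDA_THREADS_BLOCK_WIDTH. The function
 * handle and device pointers (cuPathTrace, d_buffer, d_rng_state) are
 * hypothetical names for this sketch, not the actual Cycles host code.
 *
 *   void *args[] = {&d_buffer, &d_rng_state, &sample,
 *                   &sx, &sy, &sw, &sh, &offset, &stride};
 *
 *   int xblocks = (sw + 15)/16;
 *   int yblocks = (sh + 15)/16;
 *
 *   cuLaunchKernel(cuPathTrace,
 *                  xblocks, yblocks, 1,   grid size in blocks
 *                  16, 16, 1,             threads per block
 *                  0, 0,                  shared memory, stream
 *                  args, 0);              kernel parameters, extra
 */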

#ifdef __BRANCHED_PATH__
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS)
kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
{
        int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
        int y = sy + blockDim.y*blockIdx.y + threadIdx.y;

        if(x < sx + sw && y < sy + sh)
                kernel_branched_path_trace(NULL, buffer, rng_state, sample, x, y, offset, stride);
}
#endif

extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
{
        int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
        int y = sy + blockDim.y*blockIdx.y + threadIdx.y;

        if(x < sx + sw && y < sy + sh)
                kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
}

extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
{
        int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
        int y = sy + blockDim.y*blockIdx.y + threadIdx.y;

        if(x < sx + sw && y < sy + sh)
                kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
}

extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_shader(uint4 *input,
                   float4 *output,
                   float *output_luma,
                   int type,
                   int sx,
                   int sw,
                   int offset,
                   int sample)
{
        int x = sx + blockDim.x*blockIdx.x + threadIdx.x;

        if(x < sx + sw) {
                kernel_shader_evaluate(NULL,
                                       input,
                                       output,
                                       output_luma,
                                       (ShaderEvalType)type,
                                       x,
                                       sample);
        }
}

extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_bake(uint4 *input, float4 *output, int type, int filter, int sx, int sw, int offset, int sample)
{
        int x = sx + blockDim.x*blockIdx.x + threadIdx.x;

        if(x < sx + sw)
                kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, filter, x, offset, sample);
}

#endif  /* __CUDA_ARCH__ */