Cycles: Make kernel compilable for 3.7 compute capability
[blender.git] / intern / cycles / kernel / kernels / cuda / kernel.cu
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 /* CUDA kernel entry points */
18
19 #include "../../kernel_compat_cuda.h"
20 #include "../../kernel_math.h"
21 #include "../../kernel_types.h"
22 #include "../../kernel_globals.h"
23 #include "../../kernel_film.h"
24 #include "../../kernel_path.h"
25 #include "../../kernel_path_branched.h"
26 #include "../../kernel_bake.h"
27
/* Device data taken from the CUDA occupancy calculator.
 *
 * For each supported compute capability we record the hardware limits
 * (registers per multiprocessor, max resident blocks, max threads per
 * block, max registers per thread) plus hand-tuned per-kernel register
 * budgets, which together feed __launch_bounds__ below. */

#ifdef __CUDA_ARCH__

/* 2.0 and 2.1 */
#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210
#define CUDA_MULTIPROCESSOR_MAX_REGISTERS 32768
#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8
#define CUDA_BLOCK_MAX_THREADS 1024
#define CUDA_THREAD_MAX_REGISTERS 63

/* tunable parameters */
#define CUDA_THREADS_BLOCK_WIDTH 16
#define CUDA_KERNEL_MAX_REGISTERS 32
#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40

/* 3.0 and 3.5 */
#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
#define CUDA_MULTIPROCESSOR_MAX_REGISTERS 65536
#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
#define CUDA_BLOCK_MAX_THREADS 1024
#define CUDA_THREAD_MAX_REGISTERS 63

/* tunable parameters */
#define CUDA_THREADS_BLOCK_WIDTH 16
#define CUDA_KERNEL_MAX_REGISTERS 63
#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63

/* 3.2 */
#elif __CUDA_ARCH__ == 320
#define CUDA_MULTIPROCESSOR_MAX_REGISTERS 32768
#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
#define CUDA_BLOCK_MAX_THREADS 1024
#define CUDA_THREAD_MAX_REGISTERS 63

/* tunable parameters */
#define CUDA_THREADS_BLOCK_WIDTH 16
#define CUDA_KERNEL_MAX_REGISTERS 63
#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63

/* 3.7 */
#elif __CUDA_ARCH__ == 370
#define CUDA_MULTIPROCESSOR_MAX_REGISTERS 65536
#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
#define CUDA_BLOCK_MAX_THREADS 1024
#define CUDA_THREAD_MAX_REGISTERS 255

/* tunable parameters */
#define CUDA_THREADS_BLOCK_WIDTH 16
#define CUDA_KERNEL_MAX_REGISTERS 63
#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63

/* 5.0, 5.2 and 5.3 */
#elif __CUDA_ARCH__ == 500 || __CUDA_ARCH__ == 520 || __CUDA_ARCH__ == 530
#define CUDA_MULTIPROCESSOR_MAX_REGISTERS 65536
#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
#define CUDA_BLOCK_MAX_THREADS 1024
#define CUDA_THREAD_MAX_REGISTERS 255

/* tunable parameters */
#define CUDA_THREADS_BLOCK_WIDTH 16
#define CUDA_KERNEL_MAX_REGISTERS 40
#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63

/* unknown architecture */
#else
#error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
#endif

/* Compute number of threads per block and minimum blocks per multiprocessor
 * given the maximum number of registers per thread. The second argument to
 * __launch_bounds__ is the minimum number of resident blocks needed so that
 * the per-kernel register budget is actually enforced by the compiler. */

#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
	__launch_bounds__( \
		threads_block_width*threads_block_width, \
		CUDA_MULTIPROCESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \
		)

/* sanity checks */

#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
#error "Maximum number of threads per block exceeded"
#endif

#if CUDA_MULTIPROCESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS
#error "Maximum number of blocks per multiprocessor exceeded"
#endif

#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
#error "Maximum number of registers per thread exceeded"
#endif

#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
#error "Maximum number of registers per thread exceeded"
#endif
123
124 /* kernels */
125
/* Path tracing entry point.
 *
 * 2D launch: one thread per pixel of the [sx, sx+sw) x [sy, sy+sh) tile,
 * writing one sample into the render buffer. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
{
	const int px = sx + blockIdx.x*blockDim.x + threadIdx.x;
	const int py = sy + blockIdx.y*blockDim.y + threadIdx.y;

	/* Grid may overshoot the tile; skip out-of-range threads. */
	if(px >= sx + sw || py >= sy + sh)
		return;

	kernel_path_trace(NULL, buffer, rng_state, sample, px, py, offset, stride);
}
136
#ifdef __BRANCHED_PATH__
/* Branched path tracing entry point.
 *
 * Same 2D per-pixel launch as kernel_cuda_path_trace, but uses the
 * branched integrator; only compiled when __BRANCHED_PATH__ is enabled. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS)
kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
{
	const int px = sx + blockIdx.x*blockDim.x + threadIdx.x;
	const int py = sy + blockIdx.y*blockDim.y + threadIdx.y;

	/* Grid may overshoot the tile; skip out-of-range threads. */
	if(px >= sx + sw || py >= sy + sh)
		return;

	kernel_branched_path_trace(NULL, buffer, rng_state, sample, px, py, offset, stride);
}
#endif
149
/* Convert the accumulated float render buffer to 8-bit RGBA for display.
 *
 * 2D launch: one thread per pixel of the [sx, sx+sw) x [sy, sy+sh) tile;
 * sample_scale normalizes the accumulated samples. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
{
	const int px = sx + blockIdx.x*blockDim.x + threadIdx.x;
	const int py = sy + blockIdx.y*blockDim.y + threadIdx.y;

	/* Grid may overshoot the tile; skip out-of-range threads. */
	if(px >= sx + sw || py >= sy + sh)
		return;

	kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, px, py, offset, stride);
}
160
/* Convert the accumulated float render buffer to half-float RGBA.
 *
 * Same per-pixel launch as kernel_cuda_convert_to_byte; the half data is
 * written through the uchar4 pointer (reinterpreted by the film kernel). */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
{
	const int px = sx + blockIdx.x*blockDim.x + threadIdx.x;
	const int py = sy + blockIdx.y*blockDim.y + threadIdx.y;

	/* Grid may overshoot the tile; skip out-of-range threads. */
	if(px >= sx + sw || py >= sy + sh)
		return;

	kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, px, py, offset, stride);
}
171
/* Standalone shader evaluation (displacement, background, ...).
 *
 * 1D launch: one thread per evaluation point in [sx, sx+sw). `type` is a
 * ShaderEvalType passed as int across the host/device boundary. Note that
 * `offset` is accepted for interface symmetry but not used here. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_shader(uint4 *input,
                   float4 *output,
                   float *output_luma,
                   int type,
                   int sx,
                   int sw,
                   int offset,
                   int sample)
{
	const int i = sx + blockIdx.x*blockDim.x + threadIdx.x;

	/* Grid may overshoot the work range; skip out-of-range threads. */
	if(i >= sx + sw)
		return;

	kernel_shader_evaluate(NULL,
	                       input,
	                       output,
	                       output_luma,
	                       (ShaderEvalType)type,
	                       i,
	                       sample);
}
195
/* Texture baking entry point.
 *
 * 1D launch: one thread per bake pixel in [sx, sx+sw); `type` is a
 * ShaderEvalType passed as int, `filter` selects the bake pass filter. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_bake(uint4 *input, float4 *output, int type, int filter, int sx, int sw, int offset, int sample)
{
	const int i = sx + blockIdx.x*blockDim.x + threadIdx.x;

	/* Grid may overshoot the work range; skip out-of-range threads. */
	if(i >= sx + sw)
		return;

	kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, filter, i, offset, sample);
}
205
206 #endif
207