Code refactor: add WorkTile struct for passing work to kernel.
[blender-staging.git] / intern / cycles / kernel / kernels / cuda / kernel.cu
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 /* CUDA kernel entry points */
18
19 #ifdef __CUDA_ARCH__
20
21 #include "kernel/kernel_compat_cuda.h"
22 #include "kernel_config.h"
23
24 #include "kernel/kernel_math.h"
25 #include "kernel/kernel_types.h"
26 #include "kernel/kernel_globals.h"
27 #include "kernel/kernel_film.h"
28 #include "kernel/kernel_path.h"
29 #include "kernel/kernel_path_branched.h"
30 #include "kernel/kernel_bake.h"
31 #include "kernel/kernel_work_stealing.h"
32
33 /* kernels */
/* Path tracing entry point: one thread per work item in a flattened
 * (pixel, sample) work list. Expects a 1D launch whose total thread count
 * covers total_work_size; surplus tail threads exit immediately. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_path_trace(WorkTile *tile, uint total_work_size)
{
	const int thread_index = ccl_global_id(0);

	/* Grid tail guard: the launch size rarely divides the work evenly. */
	if(thread_index >= total_work_size) {
		return;
	}

	/* Decode the flat work index into pixel coordinates and sample number. */
	uint x, y, sample;
	get_work_pixel(tile, thread_index, &x, &y, &sample);

	KernelGlobals kg;
	kernel_path_trace(&kg, tile->buffer, tile->rng_state, sample, x, y, tile->offset, tile->stride);
}
48
#ifdef __BRANCHED_PATH__
/* Branched path tracing entry point; mirrors kernel_cuda_path_trace but
 * dispatches to the branched integrator, which needs a larger register
 * budget (CUDA_KERNEL_BRANCHED_MAX_REGISTERS). */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS)
kernel_cuda_branched_path_trace(WorkTile *tile, uint total_work_size)
{
	const int thread_index = ccl_global_id(0);

	/* Grid tail guard: the launch size rarely divides the work evenly. */
	if(thread_index >= total_work_size) {
		return;
	}

	/* Decode the flat work index into pixel coordinates and sample number. */
	uint x, y, sample;
	get_work_pixel(tile, thread_index, &x, &y, &sample);

	KernelGlobals kg;
	kernel_branched_path_trace(&kg, tile->buffer, tile->rng_state, sample, x, y, tile->offset, tile->stride);
}
#endif
65
/* Film conversion: turn accumulated float render buffer pixels into byte
 * RGBA for display. Expects a 2D launch covering the (sw x sh) region that
 * starts at (sx, sy); out-of-region threads exit immediately. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
{
	const int px = sx + blockIdx.x*blockDim.x + threadIdx.x;
	const int py = sy + blockIdx.y*blockDim.y + threadIdx.y;

	/* Skip threads that fall outside the requested region. */
	if(px >= sx + sw || py >= sy + sh) {
		return;
	}

	kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, px, py, offset, stride);
}
77
/* Film conversion: turn accumulated float render buffer pixels into half
 * float RGBA (the uchar4 pointer is reinterpreted downstream). Expects a
 * 2D launch covering the (sw x sh) region starting at (sx, sy). */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
{
	const int px = sx + blockIdx.x*blockDim.x + threadIdx.x;
	const int py = sy + blockIdx.y*blockDim.y + threadIdx.y;

	/* Skip threads that fall outside the requested region. */
	if(px >= sx + sw || py >= sy + sh) {
		return;
	}

	kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, px, py, offset, stride);
}
89
/* Shader evaluation entry point used for displacement/background and
 * similar single-shader queries. Expects a 1D launch covering sw items
 * starting at sx; one thread evaluates one input element. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_shader(uint4 *input,
                   float4 *output,
                   float *output_luma,
                   int type,
                   int sx,
                   int sw,
                   int offset,
                   int sample)
{
	const int index = sx + blockIdx.x*blockDim.x + threadIdx.x;

	/* Skip threads past the end of the evaluation range. */
	if(index >= sx + sw) {
		return;
	}

	KernelGlobals kg;
	kernel_shader_evaluate(&kg,
	                       input,
	                       output,
	                       output_luma,
	                       (ShaderEvalType)type,
	                       index,
	                       sample);
}
114
#ifdef __BAKING__
/* Texture baking entry point. Expects a 1D launch covering sw items
 * starting at sx; one thread bakes one input element with the requested
 * pass type and filter flags. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_bake(uint4 *input, float4 *output, int type, int filter, int sx, int sw, int offset, int sample)
{
	const int index = sx + blockIdx.x*blockDim.x + threadIdx.x;

	/* Skip threads past the end of the bake range. */
	if(index >= sx + sw) {
		return;
	}

	KernelGlobals kg;
	kernel_bake_evaluate(&kg, input, output, (ShaderEvalType)type, filter, index, offset, sample);
}
#endif
128
129 #endif
130