Cycles: Added Cryptomatte output.
[blender.git] intern/cycles/kernel/kernels/cuda/kernel.cu
/*
 * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* CUDA kernel entry points */

#ifdef __CUDA_ARCH__

#include "kernel/kernel_compat_cuda.h"
#include "kernel_config.h"

#include "util/util_atomic.h"

#include "kernel/kernel_math.h"
#include "kernel/kernel_types.h"
#include "kernel/kernel_globals.h"
#include "kernel/kernel_color.h"
#include "kernel/kernels/cuda/kernel_cuda_image.h"
#include "kernel/kernel_film.h"
#include "kernel/kernel_path.h"
#include "kernel/kernel_path_branched.h"
#include "kernel/kernel_bake.h"
#include "kernel/kernel_work_stealing.h"

/* kernels */
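
/* Path tracing kernel. Each thread processes one work item; get_work_pixel()
 * decodes the flat work index into a pixel position and sample number. */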
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_path_trace(WorkTile *tile, uint total_work_size)
{
        int work_index = ccl_global_id(0);
        bool thread_is_active = work_index < total_work_size;
        uint x, y, sample;
        KernelGlobals kg;
        if(thread_is_active) {
                get_work_pixel(tile, work_index, &x, &y, &sample);

                kernel_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
        }

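        /* Note: the block-wide barrier below must be reached by every thread
         * in the block, including inactive ones, so it cannot sit inside the
         * thread_is_active branch. */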
        if(kernel_data.film.cryptomatte_passes) {
                __syncthreads();
                if(thread_is_active) {
                        kernel_cryptomatte_post(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
                }
        }
}

#ifdef __BRANCHED_PATH__
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS)
kernel_cuda_branched_path_trace(WorkTile *tile, uint total_work_size)
{
        int work_index = ccl_global_id(0);
        bool thread_is_active = work_index < total_work_size;
        uint x, y, sample;
        KernelGlobals kg;
        if(thread_is_active) {
                get_work_pixel(tile, work_index, &x, &y, &sample);

                kernel_branched_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
        }

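        /* As in kernel_cuda_path_trace, the barrier must be executed by all
         * threads in the block. */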
        if(kernel_data.film.cryptomatte_passes) {
                __syncthreads();
                if(thread_is_active) {
                        kernel_cryptomatte_post(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
                }
        }
}
#endif

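/* Film conversion kernels. These use a 2D launch over the tile rectangle with
 * one thread per pixel; the bounds check discards threads that fall outside
 * the sw x sh region when the grid is rounded up. */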
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
{
        int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
        int y = sy + blockDim.y*blockIdx.y + threadIdx.y;

        if(x < sx + sw && y < sy + sh) {
                kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
        }
}

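/* A minimal host-side sketch of launching a conversion kernel with the CUDA
 * runtime API. This is illustrative only, not the actual Cycles host code
 * (which drives kernels through the CUDA driver API); block_w and block_h are
 * assumed names for the block dimensions:
 *
 *   dim3 block(block_w, block_h, 1);
 *   dim3 grid((sw + block_w - 1) / block_w,
 *             (sh + block_h - 1) / block_h, 1);
 *   kernel_cuda_convert_to_byte<<<grid, block>>>(
 *           rgba, buffer, sample_scale, sx, sy, sw, sh, offset, stride);
 */
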
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
{
        int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
        int y = sy + blockDim.y*blockIdx.y + threadIdx.y;

        if(x < sx + sw && y < sy + sh) {
                kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
        }
}

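/* Shader evaluation kernels. These are 1D launches with one thread per input
 * element. */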
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_displace(uint4 *input,
                     float4 *output,
                     int type,
                     int sx,
                     int sw,
                     int offset,
                     int sample)
{
        int x = sx + blockDim.x*blockIdx.x + threadIdx.x;

        if(x < sx + sw) {
                KernelGlobals kg;
                kernel_displace_evaluate(&kg, input, output, x);
        }
}

extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_background(uint4 *input,
                       float4 *output,
                       int type,
                       int sx,
                       int sw,
                       int offset,
                       int sample)
{
        int x = sx + blockDim.x*blockIdx.x + threadIdx.x;

        if(x < sx + sw) {
                KernelGlobals kg;
                kernel_background_evaluate(&kg, input, output, x);
        }
}

#ifdef __BAKING__
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_bake(uint4 *input, float4 *output, int type, int filter, int sx, int sw, int offset, int sample)
{
        int x = sx + blockDim.x*blockIdx.x + threadIdx.x;

        if(x < sx + sw) {
                KernelGlobals kg;
                kernel_bake_evaluate(&kg, input, output, (ShaderEvalType)type, filter, x, offset, sample);
        }
}
#endif

#endif

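/* For reference, a minimal sketch of how the host side might launch the path
 * tracing kernel through the CUDA driver API. Names such as cuda_module,
 * d_work_tile, total_work_size, num_blocks and threads_per_block are
 * assumptions for this example, not taken from the actual Cycles host code:
 *
 *   CUfunction func;
 *   cuModuleGetFunction(&func, cuda_module, "kernel_cuda_path_trace");
 *
 *   void *args[] = {&d_work_tile, &total_work_size};
 *   cuLaunchKernel(func,
 *                  num_blocks, 1, 1,         // grid size
 *                  threads_per_block, 1, 1,  // block size
 *                  0, 0, args, 0);           // shared mem, stream, params, extra
 */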