intern/cycles/device/device_optix.cpp
/*
 * Copyright 2019, NVIDIA Corporation.
 * Copyright 2019, Blender Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifdef WITH_OPTIX

#  include "bvh/bvh.h"
#  include "bvh/bvh_optix.h"
#  include "device/cuda/device_cuda.h"
#  include "device/device_denoising.h"
#  include "device/device_intern.h"
#  include "render/buffers.h"
#  include "render/hair.h"
#  include "render/mesh.h"
#  include "render/object.h"
#  include "render/scene.h"
#  include "util/util_debug.h"
#  include "util/util_logging.h"
#  include "util/util_md5.h"
#  include "util/util_path.h"
#  include "util/util_progress.h"
#  include "util/util_time.h"

#  ifdef WITH_CUDA_DYNLOAD
#    include <cuew.h>
// Do not use CUDA SDK headers when using CUEW
#    define OPTIX_DONT_INCLUDE_CUDA
#  endif
#  include <optix_function_table_definition.h>
#  include <optix_stubs.h>

// TODO(pmours): Disable this once drivers have native support
#  define OPTIX_DENOISER_NO_PIXEL_STRIDE 1
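// When this is enabled, launch_denoise below first packs the denoiser input passes into
// a contiguous float3 buffer and converts back afterwards, instead of handing the
// strided render buffer to the denoiser directly.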

CCL_NAMESPACE_BEGIN

/* Make sure this stays in sync with kernel_globals.h */
struct ShaderParams {
  uint4 *input;
  float4 *output;
  int type;
  int filter;
  int sx;
  int offset;
  int sample;
};
struct KernelParams {
  WorkTile tile;
  KernelData data;
  ShaderParams shader;
#  define KERNEL_TEX(type, name) const type *name;
#  include "kernel/kernel_textures.h"
#  undef KERNEL_TEX
};
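// The KERNEL_TEX X-macro above expands every texture listed in kernel_textures.h into a
// device pointer member. As an illustration (the actual entries live in
// kernel_textures.h), a hypothetical entry KERNEL_TEX(float4, __example_tex) would
// expand to:
//   const float4 *__example_tex;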

#  define check_result_cuda(stmt) \
    { \
      CUresult res = stmt; \
      if (res != CUDA_SUCCESS) { \
        const char *name; \
        cuGetErrorName(res, &name); \
        set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
        return; \
      } \
    } \
    (void)0
#  define check_result_cuda_ret(stmt) \
    { \
      CUresult res = stmt; \
      if (res != CUDA_SUCCESS) { \
        const char *name; \
        cuGetErrorName(res, &name); \
        set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
        return false; \
      } \
    } \
    (void)0

#  define check_result_optix(stmt) \
    { \
      enum OptixResult res = stmt; \
      if (res != OPTIX_SUCCESS) { \
        const char *name = optixGetErrorName(res); \
        set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
        return; \
      } \
    } \
    (void)0
#  define check_result_optix_ret(stmt) \
    { \
      enum OptixResult res = stmt; \
      if (res != OPTIX_SUCCESS) { \
        const char *name = optixGetErrorName(res); \
        set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
        return false; \
      } \
    } \
    (void)0
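// Note that these macros 'return' from the enclosing function on failure, so the plain
// variants are only usable in functions returning void and the '_ret' variants only in
// functions returning bool.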

#  define launch_filter_kernel(func_name, w, h, args) \
    { \
      CUfunction func; \
      check_result_cuda_ret(cuModuleGetFunction(&func, cuFilterModule, func_name)); \
      check_result_cuda_ret(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); \
      int threads; \
      check_result_cuda_ret( \
          cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
      threads = (int)sqrt((float)threads); \
      int xblocks = ((w) + threads - 1) / threads; \
      int yblocks = ((h) + threads - 1) / threads; \
      check_result_cuda_ret( \
          cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0)); \
    } \
    (void)0
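// A square thread block is derived from the kernel's maximum block size. For example,
// assuming a 1024-thread limit, this gives 32x32 threads per block, so a 1920x1080
// filter launch would be issued as a 60x34 grid of blocks.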

class OptiXDevice : public CUDADevice {

  // List of OptiX program groups
  enum {
    PG_RGEN,
    PG_MISS,
    PG_HITD,  // Default hit group
    PG_HITS,  // __SHADOW_RECORD_ALL__ hit group
    PG_HITL,  // __BVH_LOCAL__ hit group (only used for triangles)
#  if OPTIX_ABI_VERSION >= 36
    PG_HITD_MOTION,
    PG_HITS_MOTION,
#  endif
    PG_BAKE,  // kernel_bake_evaluate
    PG_DISP,  // kernel_displace_evaluate
    PG_BACK,  // kernel_background_evaluate
    PG_CALL,
    NUM_PROGRAM_GROUPS = PG_CALL + 3
  };
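  // NUM_PROGRAM_GROUPS reserves three slots starting at PG_CALL, one for each of the
  // direct callables set up in load_kernels (SVM node evaluation, volume shadows and
  // subsurface scattering setup).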

  // List of OptiX pipelines
  enum { PIP_PATH_TRACE, PIP_SHADER_EVAL, NUM_PIPELINES };

  // A single shader binding table entry
  struct SbtRecord {
    char header[OPTIX_SBT_RECORD_HEADER_SIZE];
  };

  // Information stored about CUDA memory allocations
  struct CUDAMem {
    bool free_map_host = false;
    CUarray array = NULL;
    CUtexObject texobject = 0;
    bool use_mapped_host = false;
  };

  // Helper class to manage current CUDA context
  struct CUDAContextScope {
    CUDAContextScope(CUcontext ctx)
    {
      cuCtxPushCurrent(ctx);
    }
    ~CUDAContextScope()
    {
      cuCtxPopCurrent(NULL);
    }
  };
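  // Typical usage is to construct a scope at the top of any function that issues CUDA
  // or OptiX API calls, e.g.
  //   const CUDAContextScope scope(cuContext);
  // so the context is automatically popped again when the function returns.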

  // Use a pool with multiple threads to support launches with multiple CUDA streams
  TaskPool task_pool;

  vector<CUstream> cuda_stream;
  OptixDeviceContext context = NULL;

  OptixModule optix_module = NULL;  // All necessary OptiX kernels are in one module
  OptixModule builtin_modules[2] = {};
  OptixPipeline pipelines[NUM_PIPELINES] = {};

  bool motion_blur = false;
  device_vector<SbtRecord> sbt_data;
  device_only_memory<KernelParams> launch_params;
  OptixTraversableHandle tlas_handle = 0;

  OptixDenoiser denoiser = NULL;
  device_only_memory<unsigned char> denoiser_state;
  int denoiser_input_passes = 0;

 public:
  OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
      : CUDADevice(info_, stats_, profiler_, background_),
        sbt_data(this, "__sbt", MEM_READ_ONLY),
        launch_params(this, "__params", false),
        denoiser_state(this, "__denoiser_state", true)
  {
    // Store number of CUDA streams in device info
    info.cpu_threads = DebugFlags().optix.cuda_streams;

    // Make the CUDA context current
    if (!cuContext) {
      return;  // Do not initialize if CUDA context creation failed already
    }
    const CUDAContextScope scope(cuContext);

    // Create OptiX context for this device
    OptixDeviceContextOptions options = {};
#  ifdef WITH_CYCLES_LOGGING
    options.logCallbackLevel = 4;  // Fatal = 1, Error = 2, Warning = 3, Print = 4
    options.logCallbackFunction =
        [](unsigned int level, const char *, const char *message, void *) {
          switch (level) {
            case 1:
              LOG_IF(FATAL, VLOG_IS_ON(1)) << message;
              break;
            case 2:
              LOG_IF(ERROR, VLOG_IS_ON(1)) << message;
              break;
            case 3:
              LOG_IF(WARNING, VLOG_IS_ON(1)) << message;
              break;
            case 4:
              LOG_IF(INFO, VLOG_IS_ON(1)) << message;
              break;
          }
        };
#  endif
#  if OPTIX_ABI_VERSION >= 41 && defined(WITH_CYCLES_DEBUG)
    options.validationMode = OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_ALL;
#  endif
    check_result_optix(optixDeviceContextCreate(cuContext, &options, &context));
#  ifdef WITH_CYCLES_LOGGING
    check_result_optix(optixDeviceContextSetLogCallback(
        context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel));
#  endif

    // Create launch streams
    cuda_stream.resize(info.cpu_threads);
    for (int i = 0; i < info.cpu_threads; ++i)
      check_result_cuda(cuStreamCreate(&cuda_stream[i], CU_STREAM_NON_BLOCKING));
    // Work around a compiler bug that assigns the wrong size
    launch_params.data_elements = sizeof(KernelParams);
    // Allocate launch parameter buffer memory on device
    launch_params.alloc_to_device(info.cpu_threads);
  }
  ~OptiXDevice()
  {
    // Stop processing any more tasks
    task_pool.cancel();

    // Make CUDA context current
    const CUDAContextScope scope(cuContext);

    sbt_data.free();
    texture_info.free();
    launch_params.free();
    denoiser_state.free();

    // Unload modules
    if (optix_module != NULL)
      optixModuleDestroy(optix_module);
    for (unsigned int i = 0; i < 2; ++i)
      if (builtin_modules[i] != NULL)
        optixModuleDestroy(builtin_modules[i]);
    for (unsigned int i = 0; i < NUM_PIPELINES; ++i)
      if (pipelines[i] != NULL)
        optixPipelineDestroy(pipelines[i]);

    // Destroy launch streams
    for (CUstream stream : cuda_stream)
      cuStreamDestroy(stream);

    if (denoiser != NULL)
      optixDenoiserDestroy(denoiser);

    optixDeviceContextDestroy(context);
  }

 private:
  bool show_samples() const override
  {
    // Only show samples if not rendering multiple tiles in parallel
    return info.cpu_threads == 1;
  }

  BVHLayoutMask get_bvh_layout_mask() const override
  {
    // CUDA kernels are used when doing baking, so we need to build a BVH they can understand too!
    if (optix_module == NULL)
      return CUDADevice::get_bvh_layout_mask();

    // OptiX has its own internal acceleration structure format
    return BVH_LAYOUT_OPTIX;
  }

  string compile_kernel_get_common_cflags(const DeviceRequestedFeatures &requested_features,
                                          bool filter,
                                          bool /*split*/) override
  {
    // Split kernel is not supported in OptiX
    string common_cflags = CUDADevice::compile_kernel_get_common_cflags(
        requested_features, filter, false);

    // Add OptiX SDK include directory to include paths
    const char *optix_sdk_path = getenv("OPTIX_ROOT_DIR");
    if (optix_sdk_path) {
      common_cflags += string_printf(" -I\"%s/include\"", optix_sdk_path);
    }

    // Specialization for shader raytracing
    if (requested_features.use_shader_raytrace) {
      common_cflags += " --keep-device-functions";
    }
    else {
      common_cflags += " -D __NO_SHADER_RAYTRACE__";
    }
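    // As a sketch, with OPTIX_ROOT_DIR=/opt/optix (a hypothetical install path) and
    // shader raytracing disabled, this appends roughly:
    //   -I"/opt/optix/include" -D __NO_SHADER_RAYTRACE__
    // to the common CUDA compilation flags.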

    return common_cflags;
  }

  bool load_kernels(const DeviceRequestedFeatures &requested_features) override
  {
    if (have_error()) {
      // Abort early if context creation failed already
      return false;
    }

    // Load CUDA modules because we need some of the utility kernels
    if (!CUDADevice::load_kernels(requested_features)) {
      return false;
    }

    // Baking is currently performed using CUDA, so no need to load OptiX kernels
    if (requested_features.use_baking) {
      return true;
    }

    const CUDAContextScope scope(cuContext);

    // Unload existing OptiX module and pipelines first
    if (optix_module != NULL) {
      optixModuleDestroy(optix_module);
      optix_module = NULL;
    }
    for (unsigned int i = 0; i < 2; ++i) {
      if (builtin_modules[i] != NULL) {
        optixModuleDestroy(builtin_modules[i]);
        builtin_modules[i] = NULL;
      }
    }
    for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
      if (pipelines[i] != NULL) {
        optixPipelineDestroy(pipelines[i]);
        pipelines[i] = NULL;
      }
    }

    OptixModuleCompileOptions module_options = {};
    module_options.maxRegisterCount = 0;  // Do not set an explicit register limit
#  ifdef WITH_CYCLES_DEBUG
    module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_0;
    module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
#  else
    module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
    module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
#  endif

#  if OPTIX_ABI_VERSION >= 41
    module_options.boundValues = nullptr;
    module_options.numBoundValues = 0;
#  endif

    OptixPipelineCompileOptions pipeline_options = {};
    // Default to no motion blur and two-level graph, since it is the fastest option
    pipeline_options.usesMotionBlur = false;
    pipeline_options.traversableGraphFlags =
        OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING;
    pipeline_options.numPayloadValues = 6;
    pipeline_options.numAttributeValues = 2;  // u, v
    pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE;
    pipeline_options.pipelineLaunchParamsVariableName = "__params";  // See kernel_globals.h

#  if OPTIX_ABI_VERSION >= 36
    pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE;
    if (requested_features.use_hair) {
      if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) {
        pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE;
      }
      else {
        pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
      }
    }
#  endif

    // Keep track of whether motion blur is enabled, so motion can be enabled/disabled
    // in BVH builds. This is necessary since objects may be reported to have motion if
    // the Vector pass is active, but may still need to be rendered without motion blur
    // if that feature is not enabled as well.
    motion_blur = requested_features.use_object_motion;

    if (motion_blur) {
      pipeline_options.usesMotionBlur = true;
      // Motion blur can insert motion transforms into the traversal graph.
      // The graph is then no longer two-level, so the flags have to allow any configuration.
      pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY;
    }

    {  // Load and compile PTX module with OptiX kernels
      string ptx_data, ptx_filename = path_get(requested_features.use_shader_raytrace ?
                                                   "lib/kernel_optix_shader_raytrace.ptx" :
                                                   "lib/kernel_optix.ptx");
      if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
        if (!getenv("OPTIX_ROOT_DIR")) {
          set_error(
              "Missing OPTIX_ROOT_DIR environment variable (which must be set with the path to "
              "the OptiX SDK to be able to compile OptiX kernels on demand).");
          return false;
        }
        ptx_filename = compile_kernel(requested_features, "kernel_optix", "optix", true);
      }
      if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) {
        set_error("Failed to load OptiX kernel from '" + ptx_filename + "'");
        return false;
      }

      check_result_optix_ret(optixModuleCreateFromPTX(context,
                                                      &module_options,
                                                      &pipeline_options,
                                                      ptx_data.data(),
                                                      ptx_data.size(),
                                                      nullptr,
                                                      0,
                                                      &optix_module));
    }

    // Create program groups
    OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
    OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
    OptixProgramGroupOptions group_options = {};  // There are no options currently
    group_descs[PG_RGEN].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
    group_descs[PG_RGEN].raygen.module = optix_module;
    // Ignore branched integrator for now (see "requested_features.use_integrator_branched")
    group_descs[PG_RGEN].raygen.entryFunctionName = "__raygen__kernel_optix_path_trace";
    group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
    group_descs[PG_MISS].miss.module = optix_module;
    group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss";
    group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
    group_descs[PG_HITD].hitgroup.moduleCH = optix_module;
    group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit";
    group_descs[PG_HITD].hitgroup.moduleAH = optix_module;
    group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test";
    group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
    group_descs[PG_HITS].hitgroup.moduleAH = optix_module;
    group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit";

    if (requested_features.use_hair) {
      group_descs[PG_HITD].hitgroup.moduleIS = optix_module;
      group_descs[PG_HITS].hitgroup.moduleIS = optix_module;

      // Add curve intersection programs
      if (requested_features.use_hair_thick) {
        // Slower programs for thick hair since that also slows down ribbons.
        // Ideally this should not be needed.
        group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_all";
        group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_all";
      }
      else {
        group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
        group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
      }

#  if OPTIX_ABI_VERSION >= 36
      if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) {
        OptixBuiltinISOptions builtin_options = {};
        builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
        builtin_options.usesMotionBlur = false;

        check_result_optix_ret(optixBuiltinISModuleGet(
            context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0]));

        group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0];
        group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr;
        group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0];
        group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr;

        if (motion_blur) {
          builtin_options.usesMotionBlur = true;

          check_result_optix_ret(optixBuiltinISModuleGet(
              context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1]));

          group_descs[PG_HITD_MOTION] = group_descs[PG_HITD];
          group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1];
          group_descs[PG_HITS_MOTION] = group_descs[PG_HITS];
          group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1];
        }
      }
#  endif
    }

    if (requested_features.use_subsurface || requested_features.use_shader_raytrace) {
      // Add hit group for local intersections
      group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
      group_descs[PG_HITL].hitgroup.moduleAH = optix_module;
      group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit";
    }

    if (requested_features.use_baking) {
      group_descs[PG_BAKE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
      group_descs[PG_BAKE].raygen.module = optix_module;
      group_descs[PG_BAKE].raygen.entryFunctionName = "__raygen__kernel_optix_bake";
    }

    if (requested_features.use_true_displacement) {
      group_descs[PG_DISP].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
      group_descs[PG_DISP].raygen.module = optix_module;
      group_descs[PG_DISP].raygen.entryFunctionName = "__raygen__kernel_optix_displace";
    }

    if (requested_features.use_background_light) {
      group_descs[PG_BACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
      group_descs[PG_BACK].raygen.module = optix_module;
      group_descs[PG_BACK].raygen.entryFunctionName = "__raygen__kernel_optix_background";
    }

    // Shader raytracing replaces some functions with direct callables
    if (requested_features.use_shader_raytrace) {
      group_descs[PG_CALL + 0].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
      group_descs[PG_CALL + 0].callables.moduleDC = optix_module;
      group_descs[PG_CALL + 0].callables.entryFunctionNameDC = "__direct_callable__svm_eval_nodes";
      group_descs[PG_CALL + 1].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
      group_descs[PG_CALL + 1].callables.moduleDC = optix_module;
      group_descs[PG_CALL + 1].callables.entryFunctionNameDC =
          "__direct_callable__kernel_volume_shadow";
      group_descs[PG_CALL + 2].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
      group_descs[PG_CALL + 2].callables.moduleDC = optix_module;
      group_descs[PG_CALL + 2].callables.entryFunctionNameDC =
          "__direct_callable__subsurface_scatter_multi_setup";
    }

    check_result_optix_ret(optixProgramGroupCreate(
        context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups));

    // Get program stack sizes
    OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
    // Set up SBT, which in this case is used only to select between different programs
    sbt_data.alloc(NUM_PROGRAM_GROUPS);
    memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS);
    for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
      check_result_optix_ret(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
      check_result_optix_ret(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
    }
    sbt_data.copy_to_device();  // Upload SBT to device

    // Calculate maximum trace continuation stack size
    unsigned int trace_css = stack_size[PG_HITD].cssCH;
    // This is based on the maximum of closest-hit and any-hit/intersection programs
    trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
    trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
    trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
#  if OPTIX_ABI_VERSION >= 36
    trace_css = std::max(trace_css,
                         stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH);
    trace_css = std::max(trace_css,
                         stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH);
#  endif
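    // This trace_css value is later multiplied by the maximum trace depth when sizing the
    // continuation stack for each pipeline, i.e. css = cssRG + maxTraceDepth * trace_css.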

    OptixPipelineLinkOptions link_options = {};
    link_options.maxTraceDepth = 1;
#  ifdef WITH_CYCLES_DEBUG
    link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
#  else
    link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
#  endif
#  if OPTIX_ABI_VERSION < 24
    link_options.overrideUsesMotionBlur = motion_blur;
#  endif

    {  // Create path tracing pipeline
      vector<OptixProgramGroup> pipeline_groups;
      pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
      pipeline_groups.push_back(groups[PG_RGEN]);
      pipeline_groups.push_back(groups[PG_MISS]);
      pipeline_groups.push_back(groups[PG_HITD]);
      pipeline_groups.push_back(groups[PG_HITS]);
      pipeline_groups.push_back(groups[PG_HITL]);
#  if OPTIX_ABI_VERSION >= 36
      if (motion_blur) {
        pipeline_groups.push_back(groups[PG_HITD_MOTION]);
        pipeline_groups.push_back(groups[PG_HITS_MOTION]);
      }
#  endif
      if (requested_features.use_shader_raytrace) {
        pipeline_groups.push_back(groups[PG_CALL + 0]);
        pipeline_groups.push_back(groups[PG_CALL + 1]);
        pipeline_groups.push_back(groups[PG_CALL + 2]);
      }

      check_result_optix_ret(optixPipelineCreate(context,
                                                 &pipeline_options,
                                                 &link_options,
                                                 pipeline_groups.data(),
                                                 pipeline_groups.size(),
                                                 nullptr,
                                                 0,
                                                 &pipelines[PIP_PATH_TRACE]));

      // Combine ray generation and trace continuation stack size
      const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css;
      // Max direct callable depth is one of the following, so combine accordingly
      // - __raygen__ -> svm_eval_nodes
      // - __raygen__ -> kernel_volume_shadow -> svm_eval_nodes
      // - __raygen__ -> subsurface_scatter_multi_setup -> svm_eval_nodes
      const unsigned int dss = stack_size[PG_CALL + 0].dssDC +
                               std::max(stack_size[PG_CALL + 1].dssDC,
                                        stack_size[PG_CALL + 2].dssDC);

      // Set stack size depending on pipeline options
      check_result_optix_ret(
          optixPipelineSetStackSize(pipelines[PIP_PATH_TRACE],
                                    0,
                                    requested_features.use_shader_raytrace ? dss : 0,
                                    css,
                                    motion_blur ? 3 : 2));
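      // The last argument is the maximum traversable graph depth: two levels (instance
      // acceleration structure over geometry acceleration structures), plus one more for
      // the motion transforms that motion blur can insert into the graph.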
    }

    // Only need to create shader evaluation pipeline if one of these features is used:
    const bool use_shader_eval_pipeline = requested_features.use_baking ||
                                          requested_features.use_background_light ||
                                          requested_features.use_true_displacement;

    if (use_shader_eval_pipeline) {  // Create shader evaluation pipeline
      vector<OptixProgramGroup> pipeline_groups;
      pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
      pipeline_groups.push_back(groups[PG_BAKE]);
      pipeline_groups.push_back(groups[PG_DISP]);
      pipeline_groups.push_back(groups[PG_BACK]);
      pipeline_groups.push_back(groups[PG_MISS]);
      pipeline_groups.push_back(groups[PG_HITD]);
      pipeline_groups.push_back(groups[PG_HITS]);
      pipeline_groups.push_back(groups[PG_HITL]);
#  if OPTIX_ABI_VERSION >= 36
      if (motion_blur) {
        pipeline_groups.push_back(groups[PG_HITD_MOTION]);
        pipeline_groups.push_back(groups[PG_HITS_MOTION]);
      }
#  endif
      if (requested_features.use_shader_raytrace) {
        pipeline_groups.push_back(groups[PG_CALL + 0]);
        pipeline_groups.push_back(groups[PG_CALL + 1]);
        pipeline_groups.push_back(groups[PG_CALL + 2]);
      }

      check_result_optix_ret(optixPipelineCreate(context,
                                                 &pipeline_options,
                                                 &link_options,
                                                 pipeline_groups.data(),
                                                 pipeline_groups.size(),
                                                 nullptr,
                                                 0,
                                                 &pipelines[PIP_SHADER_EVAL]));

      // Calculate continuation stack size based on the maximum of all ray generation stack sizes
      const unsigned int css = std::max(stack_size[PG_BAKE].cssRG,
                                        std::max(stack_size[PG_DISP].cssRG,
                                                 stack_size[PG_BACK].cssRG)) +
                               link_options.maxTraceDepth * trace_css;
      const unsigned int dss = stack_size[PG_CALL + 0].dssDC +
                               std::max(stack_size[PG_CALL + 1].dssDC,
                                        stack_size[PG_CALL + 2].dssDC);

      check_result_optix_ret(
          optixPipelineSetStackSize(pipelines[PIP_SHADER_EVAL],
                                    0,
                                    requested_features.use_shader_raytrace ? dss : 0,
                                    css,
                                    motion_blur ? 3 : 2));
    }

    // Clean up program group objects
    for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
      optixProgramGroupDestroy(groups[i]);
    }

    return true;
  }

  void thread_run(DeviceTask &task, int thread_index)  // Main task entry point
  {
    if (have_error())
      return;  // Abort early if there was an error previously

    if (task.type == DeviceTask::RENDER) {
      if (thread_index != 0) {
        // Only execute denoising in a single thread (see also 'task_add')
        task.tile_types &= ~RenderTile::DENOISE;
      }

      RenderTile tile;
      while (task.acquire_tile(this, tile, task.tile_types)) {
        if (tile.task == RenderTile::PATH_TRACE)
          launch_render(task, tile, thread_index);
        else if (tile.task == RenderTile::BAKE) {
          // Perform baking using CUDA, since it is not currently implemented in OptiX
          device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
          CUDADevice::render(task, tile, work_tiles);
        }
        else if (tile.task == RenderTile::DENOISE)
          launch_denoise(task, tile);
        task.release_tile(tile);
        if (task.get_cancel() && !task.need_finish_queue)
          break;  // User requested cancellation
        else if (have_error())
          break;  // Abort rendering when encountering an error
      }
    }
    else if (task.type == DeviceTask::SHADER) {
      launch_shader_eval(task, thread_index);
    }
    else if (task.type == DeviceTask::DENOISE_BUFFER) {
      // Set up a single tile that covers the whole task and denoise it
      RenderTile tile;
      tile.x = task.x;
      tile.y = task.y;
      tile.w = task.w;
      tile.h = task.h;
      tile.buffer = task.buffer;
      tile.num_samples = task.num_samples;
      tile.start_sample = task.sample;
      tile.offset = task.offset;
      tile.stride = task.stride;
      tile.buffers = task.buffers;

      launch_denoise(task, tile);
    }
  }

  void launch_render(DeviceTask &task, RenderTile &rtile, int thread_index)
  {
    assert(thread_index < launch_params.data_size);

    // Keep track of total render time of this tile
    const scoped_timer timer(&rtile.buffers->render_time);

    WorkTile wtile;
    wtile.x = rtile.x;
    wtile.y = rtile.y;
    wtile.w = rtile.w;
    wtile.h = rtile.h;
    wtile.offset = rtile.offset;
    wtile.stride = rtile.stride;
    wtile.buffer = (float *)rtile.buffer;

    const int end_sample = rtile.start_sample + rtile.num_samples;
    // Keep this number reasonable to avoid running into TDRs (driver timeout and recovery)
    int step_samples = (info.display_device ? 8 : 32);

    // Offset into launch params buffer so that streams use separate data
    device_ptr launch_params_ptr = launch_params.device_pointer +
                                   thread_index * launch_params.data_elements;

    const CUDAContextScope scope(cuContext);

    for (int sample = rtile.start_sample; sample < end_sample;) {
      // Copy work tile information to device
      wtile.start_sample = sample;
      wtile.num_samples = step_samples;
      if (task.adaptive_sampling.use) {
        wtile.num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
      }
      wtile.num_samples = min(wtile.num_samples, end_sample - sample);
      device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
      check_result_cuda(
          cuMemcpyHtoDAsync(d_wtile_ptr, &wtile, sizeof(wtile), cuda_stream[thread_index]));

      OptixShaderBindingTable sbt_params = {};
      sbt_params.raygenRecord = sbt_data.device_pointer + PG_RGEN * sizeof(SbtRecord);
      sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord);
      sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
      sbt_params.missRecordCount = 1;
      sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
      sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
#  if OPTIX_ABI_VERSION >= 36
      sbt_params.hitgroupRecordCount = 5;  // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL
#  else
      sbt_params.hitgroupRecordCount = 3;  // PG_HITD, PG_HITS, PG_HITL
#  endif
      sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord);
      sbt_params.callablesRecordCount = 3;
      sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
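      // The SBT records were packed in program-group order in load_kernels, so record
      // addresses here are simply the PG_* enum values times sizeof(SbtRecord).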

      // Launch the ray generation program
      check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE],
                                     cuda_stream[thread_index],
                                     launch_params_ptr,
                                     launch_params.data_elements,
                                     &sbt_params,
                                     // Launch with samples close to each other for better locality
                                     wtile.w * wtile.num_samples,
                                     wtile.h,
                                     1));

      // Run the adaptive sampling kernels at selected samples aligned to step samples.
      uint filter_sample = wtile.start_sample + wtile.num_samples - 1;
      if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
        adaptive_sampling_filter(filter_sample, &wtile, d_wtile_ptr, cuda_stream[thread_index]);
      }

      // Wait for launch to finish
      check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));

      // Update current sample, so it is displayed correctly
      sample += wtile.num_samples;
      rtile.sample = sample;
      // Update task progress after the kernel completed rendering
      task.update_progress(&rtile, wtile.w * wtile.h * wtile.num_samples);

      if (task.get_cancel() && !task.need_finish_queue)
        return;  // Cancel rendering
    }

    // Finalize adaptive sampling
    if (task.adaptive_sampling.use) {
      device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
      adaptive_sampling_post(rtile, &wtile, d_wtile_ptr, cuda_stream[thread_index]);
      check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
      task.update_progress(&rtile, rtile.w * rtile.h * wtile.num_samples);
    }
  }

  bool launch_denoise(DeviceTask &task, RenderTile &rtile)
  {
    // Update current sample (for display and NLM denoising task)
    rtile.sample = rtile.start_sample + rtile.num_samples;

    // Make CUDA context current now, since it is used for both denoising tasks
    const CUDAContextScope scope(cuContext);

    // Choose between OptiX and NLM denoising
    if (task.denoising.type == DENOISER_OPTIX) {
      // Map neighboring tiles onto this device, with tile indices laid out as follows,
      // where index 4 is the center tile and index 9 is the target for the result:
      //   0 1 2
      //   3 4 5
      //   6 7 8  9
      RenderTileNeighbors neighbors(rtile);
      task.map_neighbor_tiles(neighbors, this);
      RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
      RenderTile &target_tile = neighbors.target;
      rtile = center_tile;  // Tile may have been modified by mapping code

      // Calculate size of the tile to denoise (including overlap)
      int4 rect = center_tile.bounds();
      // Overlap between tiles has to be at least 64 pixels
      // TODO(pmours): Query this value from OptiX
      rect = rect_expand(rect, 64);
      int4 clip_rect = neighbors.bounds();
      rect = rect_clip(rect, clip_rect);
      int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y);
      int2 overlap_offset = make_int2(rtile.x - rect.x, rtile.y - rect.y);

      // Calculate byte offsets and strides
      int pixel_stride = task.pass_stride * (int)sizeof(float);
      int pixel_offset = (rtile.offset + rtile.x + rtile.y * rtile.stride) * pixel_stride;
      const int pass_offset[3] = {
          (task.pass_denoising_data + DENOISING_PASS_COLOR) * (int)sizeof(float),
          (task.pass_denoising_data + DENOISING_PASS_ALBEDO) * (int)sizeof(float),
          (task.pass_denoising_data + DENOISING_PASS_NORMAL) * (int)sizeof(float)};
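      // Passes are interleaved per pixel in the render buffer, so a pass' byte offset
      // within one pixel is its pass index times sizeof(float), while pixel_stride
      // covers all task.pass_stride floats of a pixel.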

      // Start with the current tile pointer offset
      int input_stride = pixel_stride;
      device_ptr input_ptr = rtile.buffer + pixel_offset;

      // Copy tile data into a common buffer if necessary
      device_only_memory<float> input(this, "denoiser input", true);
      device_vector<TileInfo> tile_info_mem(this, "denoiser tile info", MEM_READ_ONLY);

      bool contiguous_memory = true;
      for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
        if (neighbors.tiles[i].buffer && neighbors.tiles[i].buffer != rtile.buffer) {
          contiguous_memory = false;
        }
      }

      if (contiguous_memory) {
        // Tiles are in contiguous memory, so we can just subtract the overlap offset
        input_ptr -= (overlap_offset.x + overlap_offset.y * rtile.stride) * pixel_stride;
        // Stride covers the whole width of the image and not just a single tile
        input_stride *= rtile.stride;
      }
      else {
        // Adjacent tiles are in separate memory regions, so they have to be copied into a single one
        input.alloc_to_device(rect_size.x * rect_size.y * task.pass_stride);
        // Start with the new input buffer
        input_ptr = input.device_pointer;
        // Stride covers the width of the new input buffer, which includes tile width and overlap
        input_stride *= rect_size.x;

        TileInfo *tile_info = tile_info_mem.alloc(1);
        for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
          tile_info->offsets[i] = neighbors.tiles[i].offset;
          tile_info->strides[i] = neighbors.tiles[i].stride;
          tile_info->buffers[i] = neighbors.tiles[i].buffer;
        }
        tile_info->x[0] = neighbors.tiles[3].x;
        tile_info->x[1] = neighbors.tiles[4].x;
        tile_info->x[2] = neighbors.tiles[5].x;
        tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w;
        tile_info->y[0] = neighbors.tiles[1].y;
        tile_info->y[1] = neighbors.tiles[4].y;
        tile_info->y[2] = neighbors.tiles[7].y;
        tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h;
        tile_info_mem.copy_to_device();

        void *args[] = {
            &input.device_pointer, &tile_info_mem.device_pointer, &rect.x, &task.pass_stride};
        launch_filter_kernel("kernel_cuda_filter_copy_input", rect_size.x, rect_size.y, args);
      }

#  if OPTIX_DENOISER_NO_PIXEL_STRIDE
      device_only_memory<float> input_rgb(this, "denoiser input rgb", true);
      input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * task.denoising.input_passes);

      void *input_args[] = {&input_rgb.device_pointer,
                            &input_ptr,
                            &rect_size.x,
                            &rect_size.y,
                            &input_stride,
                            &task.pass_stride,
                            const_cast<int *>(pass_offset),
                            &task.denoising.input_passes,
                            &rtile.sample};
      launch_filter_kernel(
          "kernel_cuda_filter_convert_to_rgb", rect_size.x, rect_size.y, input_args);

      input_ptr = input_rgb.device_pointer;
      pixel_stride = 3 * sizeof(float);
      input_stride = rect_size.x * pixel_stride;
#  endif
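      // At this point the input passes have been repacked as consecutive float3 images
      // (color, then albedo, then normal), which is why the input layer addresses below
      // can be computed with a simple per-pass offset of
      // rect_size.x * rect_size.y * pixel_stride.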

      const bool recreate_denoiser = (denoiser == NULL) ||
                                     (task.denoising.input_passes != denoiser_input_passes);
      if (recreate_denoiser) {
        // Destroy existing handle before creating new one
        if (denoiser != NULL) {
          optixDenoiserDestroy(denoiser);
        }

        // Create OptiX denoiser handle on demand when it is first used
        OptixDenoiserOptions denoiser_options = {};
        assert(task.denoising.input_passes >= 1 && task.denoising.input_passes <= 3);
#  if OPTIX_ABI_VERSION >= 47
        denoiser_options.guideAlbedo = task.denoising.input_passes >= 2;
        denoiser_options.guideNormal = task.denoising.input_passes >= 3;
        check_result_optix_ret(optixDenoiserCreate(
            context, OPTIX_DENOISER_MODEL_KIND_HDR, &denoiser_options, &denoiser));
#  else
        denoiser_options.inputKind = static_cast<OptixDenoiserInputKind>(
            OPTIX_DENOISER_INPUT_RGB + (task.denoising.input_passes - 1));
#    if OPTIX_ABI_VERSION < 28
        denoiser_options.pixelFormat = OPTIX_PIXEL_FORMAT_FLOAT3;
#    endif
        check_result_optix_ret(optixDenoiserCreate(context, &denoiser_options, &denoiser));
        check_result_optix_ret(
            optixDenoiserSetModel(denoiser, OPTIX_DENOISER_MODEL_KIND_HDR, NULL, 0));
#  endif

        // OptiX denoiser handle was created with the requested number of input passes
        denoiser_input_passes = task.denoising.input_passes;
      }

      OptixDenoiserSizes sizes = {};
      check_result_optix_ret(
          optixDenoiserComputeMemoryResources(denoiser, rect_size.x, rect_size.y, &sizes));

#  if OPTIX_ABI_VERSION < 28
      const size_t scratch_size = sizes.recommendedScratchSizeInBytes;
#  else
      const size_t scratch_size = sizes.withOverlapScratchSizeInBytes;
#  endif
      const size_t scratch_offset = sizes.stateSizeInBytes;

      // Allocate denoiser state if tile size has changed since last setup
      if (recreate_denoiser || (denoiser_state.data_width != rect_size.x ||
                                denoiser_state.data_height != rect_size.y)) {
        denoiser_state.alloc_to_device(scratch_offset + scratch_size);

        // Initialize denoiser state for the current tile size
        check_result_optix_ret(optixDenoiserSetup(denoiser,
                                                  0,
                                                  rect_size.x,
                                                  rect_size.y,
                                                  denoiser_state.device_pointer,
                                                  scratch_offset,
                                                  denoiser_state.device_pointer + scratch_offset,
                                                  scratch_size));

        denoiser_state.data_width = rect_size.x;
        denoiser_state.data_height = rect_size.y;
      }

      // Set up input and output layer information
      OptixImage2D input_layers[3] = {};
      OptixImage2D output_layers[1] = {};

      for (int i = 0; i < 3; ++i) {
#  if OPTIX_DENOISER_NO_PIXEL_STRIDE
        input_layers[i].data = input_ptr + (rect_size.x * rect_size.y * pixel_stride * i);
#  else
        input_layers[i].data = input_ptr + pass_offset[i];
#  endif
        input_layers[i].width = rect_size.x;
        input_layers[i].height = rect_size.y;
        input_layers[i].rowStrideInBytes = input_stride;
        input_layers[i].pixelStrideInBytes = pixel_stride;
        input_layers[i].format = OPTIX_PIXEL_FORMAT_FLOAT3;
      }

#  if OPTIX_DENOISER_NO_PIXEL_STRIDE
      output_layers[0].data = input_ptr;
      output_layers[0].width = rect_size.x;
      output_layers[0].height = rect_size.y;
      output_layers[0].rowStrideInBytes = input_stride;
      output_layers[0].pixelStrideInBytes = pixel_stride;
      int2 output_offset = overlap_offset;
      overlap_offset = make_int2(0, 0);  // Not supported by denoiser API, so apply manually
#  else
      output_layers[0].data = target_tile.buffer + pixel_offset;
      output_layers[0].width = target_tile.w;
      output_layers[0].height = target_tile.h;
      output_layers[0].rowStrideInBytes = target_tile.stride * pixel_stride;
      output_layers[0].pixelStrideInBytes = pixel_stride;
#  endif
      output_layers[0].format = OPTIX_PIXEL_FORMAT_FLOAT3;

#  if OPTIX_ABI_VERSION >= 47
      OptixDenoiserLayer image_layers = {};
      image_layers.input = input_layers[0];
      image_layers.output = output_layers[0];

      OptixDenoiserGuideLayer guide_layers = {};
      guide_layers.albedo = input_layers[1];
      guide_layers.normal = input_layers[2];
#  endif

      // Finally run denoising
      OptixDenoiserParams params = {};  // All parameters are disabled/zero
#  if OPTIX_ABI_VERSION >= 47
      check_result_optix_ret(optixDenoiserInvoke(denoiser,
                                                 NULL,
                                                 &params,
                                                 denoiser_state.device_pointer,
                                                 scratch_offset,
                                                 &guide_layers,
                                                 &image_layers,
                                                 1,
                                                 overlap_offset.x,
                                                 overlap_offset.y,
                                                 denoiser_state.device_pointer + scratch_offset,
                                                 scratch_size));
#  else
      check_result_optix_ret(optixDenoiserInvoke(denoiser,
                                                 NULL,
                                                 &params,
                                                 denoiser_state.device_pointer,
                                                 scratch_offset,
                                                 input_layers,
                                                 task.denoising.input_passes,
                                                 overlap_offset.x,
                                                 overlap_offset.y,
                                                 output_layers,
                                                 denoiser_state.device_pointer + scratch_offset,
                                                 scratch_size));
#  endif

#  if OPTIX_DENOISER_NO_PIXEL_STRIDE
      void *output_args[] = {&input_ptr,
                             &target_tile.buffer,
                             &output_offset.x,
                             &output_offset.y,
                             &rect_size.x,
                             &rect_size.y,
                             &target_tile.x,
                             &target_tile.y,
                             &target_tile.w,
                             &target_tile.h,
                             &target_tile.offset,
                             &target_tile.stride,
                             &task.pass_stride,
                             &rtile.sample};
      launch_filter_kernel(
          "kernel_cuda_filter_convert_from_rgb", target_tile.w, target_tile.h, output_args);
#  endif

      check_result_cuda_ret(cuStreamSynchronize(0));

      task.unmap_neighbor_tiles(neighbors, this);
    }
    else {
      // Run CUDA denoising kernels
      DenoisingTask denoising(this, task);
      CUDADevice::denoise(rtile, denoising);
    }

    // Update task progress after the denoiser completed processing
    task.update_progress(&rtile, rtile.w * rtile.h);

    return true;
  }

  void launch_shader_eval(DeviceTask &task, int thread_index)
  {
    unsigned int rgen_index = PG_BACK;
    if (task.shader_eval_type >= SHADER_EVAL_BAKE)
      rgen_index = PG_BAKE;
    if (task.shader_eval_type == SHADER_EVAL_DISPLACE)
      rgen_index = PG_DISP;

    const CUDAContextScope scope(cuContext);

    device_ptr launch_params_ptr = launch_params.device_pointer +
                                   thread_index * launch_params.data_elements;

    for (int sample = 0; sample < task.num_samples; ++sample) {
      ShaderParams params;
      params.input = (uint4 *)task.shader_input;
      params.output = (float4 *)task.shader_output;
      params.type = task.shader_eval_type;
      params.filter = task.shader_filter;
      params.sx = task.shader_x;
      params.offset = task.offset;
      params.sample = sample;

      check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, shader),
                                          &params,
                                          sizeof(params),
                                          cuda_stream[thread_index]));

      OptixShaderBindingTable sbt_params = {};
      sbt_params.raygenRecord = sbt_data.device_pointer + rgen_index * sizeof(SbtRecord);
      sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord);
      sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
      sbt_params.missRecordCount = 1;
      sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
      sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
#  if OPTIX_ABI_VERSION >= 36
      sbt_params.hitgroupRecordCount = 5;  // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL
#  else
      sbt_params.hitgroupRecordCount = 3;  // PG_HITD, PG_HITS, PG_HITL
#  endif
      sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord);
      sbt_params.callablesRecordCount = 3;
      sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);

      check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL],
                                     cuda_stream[thread_index],
                                     launch_params_ptr,
                                     launch_params.data_elements,
                                     &sbt_params,
                                     task.shader_w,
                                     1,
                                     1));

      check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));

      task.update_progress(NULL);
    }
  }

  bool build_optix_bvh(BVHOptiX *bvh,
                       OptixBuildOperation operation,
                       const OptixBuildInput &build_input,
                       uint16_t num_motion_steps)
  {
    /* Allocate and build acceleration structures only one at a time, to prevent parallel builds
     * from running out of memory (since both original and compacted acceleration structure memory
     * may be allocated at the same time for the duration of this function). The builds would
     * otherwise happen on the same CUDA stream anyway. */
    static thread_mutex mutex;
    thread_scoped_lock lock(mutex);

    const CUDAContextScope scope(cuContext);

    // Compute memory usage
    OptixAccelBufferSizes sizes = {};
    OptixAccelBuildOptions options = {};
    options.operation = operation;
    if (background) {
      // Prefer best performance and lowest memory consumption in background
      options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
    }
    else {
      // Prefer fast updates in viewport
      options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE;
    }

    options.motionOptions.numKeys = num_motion_steps;
    options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH;
    options.motionOptions.timeBegin = 0.0f;
    options.motionOptions.timeEnd = 1.0f;

    check_result_optix_ret(
        optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));

    // Allocate required output buffers
    device_only_memory<char> temp_mem(this, "optix temp as build mem", true);
    temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
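    // The extra 8 bytes (plus the alignment) reserve space at the end of the temporary
    // buffer for the compacted-size property that is emitted during the build below.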
1214     if (!temp_mem.device_pointer)
1215       return false;  // Make sure temporary memory allocation succeeded
1216
1217     // Acceleration structure memory has to be allocated on the device (not allowed to be on host)
1218     device_only_memory<char> &out_data = bvh->as_data;
1219     if (operation == OPTIX_BUILD_OPERATION_BUILD) {
1220       assert(out_data.device == this);
1221       out_data.alloc_to_device(sizes.outputSizeInBytes);
1222       if (!out_data.device_pointer)
1223         return false;
1224     }
1225     else {
1226       assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes);
1227     }
1228
1229     // Finally build the acceleration structure
1230     OptixAccelEmitDesc compacted_size_prop = {};
1231     compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
1232     // A tiny space was allocated for this property at the end of the temporary buffer above
1233     // Make sure this pointer is 8-byte aligned
1234     compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8);
1235
1236     OptixTraversableHandle out_handle = 0;
1237     check_result_optix_ret(optixAccelBuild(context,
1238                                            NULL,
1239                                            &options,
1240                                            &build_input,
1241                                            1,
1242                                            temp_mem.device_pointer,
1243                                            sizes.tempSizeInBytes,
1244                                            out_data.device_pointer,
1245                                            sizes.outputSizeInBytes,
1246                                            &out_handle,
1247                                            background ? &compacted_size_prop : NULL,
1248                                            background ? 1 : 0));
1249     bvh->traversable_handle = static_cast<uint64_t>(out_handle);
1250
1251     // Wait for all operations to finish
1252     check_result_cuda_ret(cuStreamSynchronize(NULL));
1253
1254     // Compact the acceleration structure to save memory (skipped in the viewport to keep builds fast)
1255     if (background) {
1256       uint64_t compacted_size = sizes.outputSizeInBytes;
1257       check_result_cuda_ret(
1258           cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size)));
1259
1260       // Temporary memory is no longer needed, so free it now to make space
1261       temp_mem.free();
1262
1263       // There is no point in compacting if the size does not shrink
1264       if (compacted_size < sizes.outputSizeInBytes) {
1265         device_only_memory<char> compacted_data(this, "optix compacted as", false);
1266         compacted_data.alloc_to_device(compacted_size);
1267         if (!compacted_data.device_pointer)
1268           // Do not compact if memory allocation for the compacted acceleration structure fails
1269           // The uncompacted one can still be used, so succeed here regardless
1270           return true;
1271
1272         check_result_optix_ret(optixAccelCompact(context,
1273                                                  NULL,
1274                                                  out_handle,
1275                                                  compacted_data.device_pointer,
1276                                                  compacted_size,
1277                                                  &out_handle));
1278         bvh->traversable_handle = static_cast<uint64_t>(out_handle);
1279
1280         // Wait for compaction to finish
1281         check_result_cuda_ret(cuStreamSynchronize(NULL));
1282
1283         std::swap(out_data.device_size, compacted_data.device_size);
1284         std::swap(out_data.device_pointer, compacted_data.device_pointer);
1285         // Original acceleration structure memory is freed when 'compacted_data' goes out of scope
1286       }
1287     }
1288
1289     return true;
1290   }
1291
1292   void build_bvh(BVH *bvh, Progress &progress, bool refit) override
1293   {
1294     if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2) {
1295       /* Baking uses the CUDA kernels, so build the BVH layout appropriate for those. */
1296       Device::build_bvh(bvh, progress, refit);
1297       return;
1298     }
1299
1300     BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
1301
1302     progress.set_substatus("Building OptiX acceleration structure");
1303
1304     if (!bvh->params.top_level) {
1305       assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1);
1306
1307       // Refit is only possible in the viewport for now (because the AS is built with
1308       // OPTIX_BUILD_FLAG_ALLOW_UPDATE only there, see above)
1309       OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD;
1310       if (refit && !background) {
1311         assert(bvh_optix->traversable_handle != 0);
1312         operation = OPTIX_BUILD_OPERATION_UPDATE;
1313       }
1314       else {
1315         bvh_optix->as_data.free();
1316         bvh_optix->traversable_handle = 0;
1317       }
1318
1319       // Build bottom level acceleration structures (BLAS)
1320       Geometry *const geom = bvh->geometry[0];
1321       if (geom->geometry_type == Geometry::HAIR) {
1322         // Build BLAS for curve primitives
1323         Hair *const hair = static_cast<Hair *const>(geom);
1324         if (hair->num_curves() == 0) {
1325           return;
1326         }
1327
1328         const size_t num_segments = hair->num_segments();
1329
1330         size_t num_motion_steps = 1;
1331         Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
1332         if (motion_blur && hair->get_use_motion_blur() && motion_keys) {
1333           num_motion_steps = hair->get_motion_steps();
1334         }
1335
1336         device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY);
1337 #  if OPTIX_ABI_VERSION >= 36
1338         device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
1339         device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
1340         // Four control points for each curve segment
1341         const size_t num_vertices = num_segments * 4;
1342         if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
1343           index_data.alloc(num_segments);
1344           vertex_data.alloc(num_vertices * num_motion_steps);
1345         }
1346         else
1347 #  endif
1348           aabb_data.alloc(num_segments * num_motion_steps);
1349
1350         // Get AABBs for each motion step
1351         for (size_t step = 0; step < num_motion_steps; ++step) {
1352           // The center step for motion vertices is not stored in the attribute
1353           const float3 *keys = hair->get_curve_keys().data();
1354           size_t center_step = (num_motion_steps - 1) / 2;
1355           if (step != center_step) {
1356             size_t attr_offset = (step > center_step) ? step - 1 : step;
1357             // Technically this is a float4 array, but sizeof(float3) == sizeof(float4)
1358             keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size();
1359           }
1360
1361           for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) {
1362             const Hair::Curve curve = hair->get_curve(j);
1363 #  if OPTIX_ABI_VERSION >= 36
1364             const array<float> &curve_radius = hair->get_curve_radius();
1365 #  endif
1366
1367             for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) {
1368 #  if OPTIX_ABI_VERSION >= 36
1369               if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
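                     // Neighboring key indices, clamped at the curve ends so that every
                     // segment has four control points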
1370                 int k0 = curve.first_key + segment;
1371                 int k1 = k0 + 1;
1372                 int ka = max(k0 - 1, curve.first_key);
1373                 int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1);
1374
1375                 const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x);
1376                 const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y);
1377                 const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z);
1378                 const float4 pw = make_float4(
1379                     curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]);
1380
1381                 // Convert Catmull-Rom control points to cubic B-spline control points
1382                 static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f;
1383                 static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f;
1384                 static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f;
1385                 static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f;
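                     // Each cr2bsp row is a row of the change-of-basis matrix
                     // inv(M_bspline) * M_catmullrom, so the dot products below map the
                     // four Catmull-Rom points to one B-spline control point per component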
1386
1387                 index_data[i] = i * 4;
1388                 float4 *const v = vertex_data.data() + step * num_vertices + index_data[i];
1389                 v[0] = make_float4(
1390                     dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw));
1391                 v[1] = make_float4(
1392                     dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw));
1393                 v[2] = make_float4(
1394                     dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw));
1395                 v[3] = make_float4(
1396                     dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw));
1397               }
1398               else
1399 #  endif
1400               {
1401                 BoundBox bounds = BoundBox::empty;
1402                 curve.bounds_grow(segment, keys, hair->get_curve_radius().data(), bounds);
1403
1404                 const size_t index = step * num_segments + i;
1405                 aabb_data[index].minX = bounds.min.x;
1406                 aabb_data[index].minY = bounds.min.y;
1407                 aabb_data[index].minZ = bounds.min.z;
1408                 aabb_data[index].maxX = bounds.max.x;
1409                 aabb_data[index].maxY = bounds.max.y;
1410                 aabb_data[index].maxZ = bounds.max.z;
1411               }
1412             }
1413           }
1414         }
1415
1416         // Upload AABB data to GPU
1417         aabb_data.copy_to_device();
1418 #  if OPTIX_ABI_VERSION >= 36
1419         index_data.copy_to_device();
1420         vertex_data.copy_to_device();
1421 #  endif
1422
1423         vector<device_ptr> aabb_ptrs;
1424         aabb_ptrs.reserve(num_motion_steps);
1425 #  if OPTIX_ABI_VERSION >= 36
1426         vector<device_ptr> width_ptrs;
1427         vector<device_ptr> vertex_ptrs;
1428         width_ptrs.reserve(num_motion_steps);
1429         vertex_ptrs.reserve(num_motion_steps);
1430 #  endif
1431         for (size_t step = 0; step < num_motion_steps; ++step) {
1432           aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb));
1433 #  if OPTIX_ABI_VERSION >= 36
1434           const device_ptr base_ptr = vertex_data.device_pointer +
1435                                       step * num_vertices * sizeof(float4);
1436           width_ptrs.push_back(base_ptr + 3 * sizeof(float));  // Width is stored in the vertex w component
1437           vertex_ptrs.push_back(base_ptr);
1438 #  endif
1439         }
1440
1441         // Force a single any-hit call, so shadow record-all behavior works correctly
1442         unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
1443         OptixBuildInput build_input = {};
1444 #  if OPTIX_ABI_VERSION >= 36
1445         if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
1446           build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES;
1447           build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
1448           build_input.curveArray.numPrimitives = num_segments;
1449           build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
1450           build_input.curveArray.numVertices = num_vertices;
1451           build_input.curveArray.vertexStrideInBytes = sizeof(float4);
1452           build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data();
1453           build_input.curveArray.widthStrideInBytes = sizeof(float4);
1454           build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer;
1455           build_input.curveArray.indexStrideInBytes = sizeof(int);
1456           build_input.curveArray.flag = build_flags;
1457           build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset;
1458         }
1459         else
1460 #  endif
1461         {
1462           // Disable the visibility test any-hit program, since visibility is already checked
1463           // during intersection. Trace calls that require any-hit can force it with a ray flag.
1464           build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
1465
1466           build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
1467 #  if OPTIX_ABI_VERSION < 23
1468           build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
1469           build_input.aabbArray.numPrimitives = num_segments;
1470           build_input.aabbArray.strideInBytes = sizeof(OptixAabb);
1471           build_input.aabbArray.flags = &build_flags;
1472           build_input.aabbArray.numSbtRecords = 1;
1473           build_input.aabbArray.primitiveIndexOffset = hair->optix_prim_offset;
1474 #  else
1475           build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
1476           build_input.customPrimitiveArray.numPrimitives = num_segments;
1477           build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb);
1478           build_input.customPrimitiveArray.flags = &build_flags;
1479           build_input.customPrimitiveArray.numSbtRecords = 1;
1480           build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset;
1481 #  endif
1482         }
1483
1484         if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
1485           progress.set_error("Failed to build OptiX acceleration structure");
1486         }
1487       }
1488       else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) {
1489         // Build BLAS for triangle primitives
1490         Mesh *const mesh = static_cast<Mesh *const>(geom);
1491         if (mesh->num_triangles() == 0) {
1492           return;
1493         }
1494
1495         const size_t num_verts = mesh->get_verts().size();
1496
1497         size_t num_motion_steps = 1;
1498         Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
1499         if (motion_blur && mesh->get_use_motion_blur() && motion_keys) {
1500           num_motion_steps = mesh->get_motion_steps();
1501         }
1502
1503         device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
1504         index_data.alloc(mesh->get_triangles().size());
1505         memcpy(index_data.data(),
1506                mesh->get_triangles().data(),
1507                mesh->get_triangles().size() * sizeof(int));
1508         device_vector<float3> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
1509         vertex_data.alloc(num_verts * num_motion_steps);
1510
1511         for (size_t step = 0; step < num_motion_steps; ++step) {
1512           const float3 *verts = mesh->get_verts().data();
1513
1514           size_t center_step = (num_motion_steps - 1) / 2;
1515           // The center step for motion vertices is not stored in the attribute
1516           if (step != center_step) {
1517             verts = motion_keys->data_float3() +
1518                     (step > center_step ? step - 1 : step) * num_verts;
1519           }
1520
1521           memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3));
1522         }
1523
1524         // Upload triangle data to GPU
1525         index_data.copy_to_device();
1526         vertex_data.copy_to_device();
1527
1528         vector<device_ptr> vertex_ptrs;
1529         vertex_ptrs.reserve(num_motion_steps);
1530         for (size_t step = 0; step < num_motion_steps; ++step) {
1531           vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3));
1532         }
1533
1534         // Force a single any-hit call, so shadow record-all behavior works correctly
1535         unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
1536         OptixBuildInput build_input = {};
1537         build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
1538         build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
1539         build_input.triangleArray.numVertices = num_verts;
1540         build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
1541         build_input.triangleArray.vertexStrideInBytes = sizeof(float3);
1542         build_input.triangleArray.indexBuffer = index_data.device_pointer;
1543         build_input.triangleArray.numIndexTriplets = mesh->num_triangles();
1544         build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3;
1545         build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int);
1546         build_input.triangleArray.flags = &build_flags;
1547         // The SBT does not store per-primitive data, since Cycles already allocates separate
1548         // buffers for that purpose. OptiX does not allow a record count of zero though, so just
1549         // pass in one and rely on it meaning the same thing in this case.
1550         build_input.triangleArray.numSbtRecords = 1;
1551         build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset;
1552
1553         if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
1554           progress.set_error("Failed to build OptiX acceleration structure");
1555         }
1556       }
1557     }
1558     else {
1559       unsigned int num_instances = 0;
1560       unsigned int max_num_instances = 0xFFFFFFFF;
1561
1562       bvh_optix->as_data.free();
1563       bvh_optix->traversable_handle = 0;
1564       bvh_optix->motion_transform_data.free();
1565
1566       optixDeviceContextGetProperty(context,
1567                                     OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID,
1568                                     &max_num_instances,
1569                                     sizeof(max_num_instances));
1570       // Do not count the first bit, which is used to distinguish instanced from non-instanced objects
1571       max_num_instances >>= 1;
1572       if (bvh->objects.size() > max_num_instances) {
1573         progress.set_error(
1574             "Failed to build OptiX acceleration structure because there are too many instances");
1575         return;
1576       }
1577
1578       // Fill instance descriptions
1579 #  if OPTIX_ABI_VERSION < 41
1580       device_vector<OptixAabb> aabbs(this, "optix tlas aabbs", MEM_READ_ONLY);
1581       aabbs.alloc(bvh->objects.size());
1582 #  endif
1583       device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY);
1584       instances.alloc(bvh->objects.size());
1585
1586       // Calculate the total size of all motion transforms and allocate memory for them
1587       size_t motion_transform_offset = 0;
1588       if (motion_blur) {
1589         size_t total_motion_transform_size = 0;
1590         for (Object *const ob : bvh->objects) {
1591           if (ob->is_traceable() && ob->use_motion()) {
1592             total_motion_transform_size = align_up(total_motion_transform_size,
1593                                                    OPTIX_TRANSFORM_BYTE_ALIGNMENT);
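                 // OptixSRTMotionTransform embeds storage for two SRT keys, so only the
                 // keys beyond those two need additional space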
1594             const size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
1595             total_motion_transform_size = total_motion_transform_size +
1596                                           sizeof(OptixSRTMotionTransform) +
1597                                           motion_keys * sizeof(OptixSRTData);
1598           }
1599         }
1600
1601         assert(bvh_optix->motion_transform_data.device == this);
1602         bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size);
1603       }
1604
1605       for (Object *ob : bvh->objects) {
1606         // Skip non-traceable objects
1607         if (!ob->is_traceable())
1608           continue;
1609
1610         BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh);
1611         OptixTraversableHandle handle = blas->traversable_handle;
1612
1613 #  if OPTIX_ABI_VERSION < 41
1614         OptixAabb &aabb = aabbs[num_instances];
1615         aabb.minX = ob->bounds.min.x;
1616         aabb.minY = ob->bounds.min.y;
1617         aabb.minZ = ob->bounds.min.z;
1618         aabb.maxX = ob->bounds.max.x;
1619         aabb.maxY = ob->bounds.max.y;
1620         aabb.maxZ = ob->bounds.max.z;
1621 #  endif
1622
1623         OptixInstance &instance = instances[num_instances++];
1624         memset(&instance, 0, sizeof(instance));
1625
1626         // Clear transform to identity matrix
1627         instance.transform[0] = 1.0f;
1628         instance.transform[5] = 1.0f;
1629         instance.transform[10] = 1.0f;
1630
1631         // Set the user instance ID to the object index (but leave the low bit unset)
1632         instance.instanceId = ob->get_device_index() << 1;
1633
1634         // At least one bit has to be set in the mask, or else the instance would always be culled
1635         instance.visibilityMask = 1;
1636
1637         if (ob->get_geometry()->has_volume) {
1638           // Volumes have a special bit set in the visibility mask so a trace can mask only volumes
1639           instance.visibilityMask |= 2;
1640         }
1641
1642         if (ob->get_geometry()->geometry_type == Geometry::HAIR) {
1643           // Same applies to curves (so they can be skipped in local trace calls)
1644           instance.visibilityMask |= 4;
1645
1646 #  if OPTIX_ABI_VERSION >= 36
1647           if (motion_blur && ob->get_geometry()->has_motion_blur() &&
1648               DebugFlags().optix.curves_api &&
1649               static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
1650             // Select between motion blur and non-motion blur built-in intersection module
1651             instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
1652           }
1653 #  endif
1654         }
1655
1656         // Insert motion traversable if object has motion
1657         if (motion_blur && ob->use_motion()) {
1658           size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
1659           size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
1660                                          motion_keys * sizeof(OptixSRTData);
1661
1662           const CUDAContextScope scope(cuContext);
1663
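               // Sub-allocate this object's transform from the shared motion transform
               // buffer allocated above, respecting the alignment OptiX requires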
1664           motion_transform_offset = align_up(motion_transform_offset,
1665                                              OPTIX_TRANSFORM_BYTE_ALIGNMENT);
1666           CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer +
1667                                              motion_transform_offset;
1668           motion_transform_offset += motion_transform_size;
1669
1670           // Allocate host side memory for motion transform and fill it with transform data
1671           OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
1672               new uint8_t[motion_transform_size]);
1673           motion_transform.child = handle;
1674           motion_transform.motionOptions.numKeys = ob->get_motion().size();
1675           motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
1676           motion_transform.motionOptions.timeBegin = 0.0f;
1677           motion_transform.motionOptions.timeEnd = 1.0f;
1678
1679           OptixSRTData *const srt_data = motion_transform.srtData;
1680           array<DecomposedTransform> decomp(ob->get_motion().size());
1681           transform_motion_decompose(
1682               decomp.data(), ob->get_motion().data(), ob->get_motion().size());
1683
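               // transform_motion_decompose() stores the rotation quaternion in 'x', the
               // translation in 'y'.xyz and spreads the scale/shear factors over the
               // remaining components, hence the mapping below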
1684           for (size_t i = 0; i < ob->get_motion().size(); ++i) {
1685             // Scale
1686             srt_data[i].sx = decomp[i].y.w;  // scale.x.x
1687             srt_data[i].sy = decomp[i].z.w;  // scale.y.y
1688             srt_data[i].sz = decomp[i].w.w;  // scale.z.z
1689
1690             // Shear
1691             srt_data[i].a = decomp[i].z.x;  // scale.x.y
1692             srt_data[i].b = decomp[i].z.y;  // scale.x.z
1693             srt_data[i].c = decomp[i].w.x;  // scale.y.z
1694             assert(decomp[i].z.z == 0.0f);  // scale.y.x
1695             assert(decomp[i].w.y == 0.0f);  // scale.z.x
1696             assert(decomp[i].w.z == 0.0f);  // scale.z.y
1697
1698             // Pivot point
1699             srt_data[i].pvx = 0.0f;
1700             srt_data[i].pvy = 0.0f;
1701             srt_data[i].pvz = 0.0f;
1702
1703             // Rotation
1704             srt_data[i].qx = decomp[i].x.x;
1705             srt_data[i].qy = decomp[i].x.y;
1706             srt_data[i].qz = decomp[i].x.z;
1707             srt_data[i].qw = decomp[i].x.w;
1708
1709             // Translation
1710             srt_data[i].tx = decomp[i].y.x;
1711             srt_data[i].ty = decomp[i].y.y;
1712             srt_data[i].tz = decomp[i].y.z;
1713           }
1714
1715           // Upload motion transform to GPU
1716           cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
1717           delete[] reinterpret_cast<uint8_t *>(&motion_transform);
1718
1719           // Disable the instance transform, since the object already uses a motion transform
1720           instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
1721
1722           // Get traversable handle to motion transform
1723           optixConvertPointerToTraversableHandle(context,
1724                                                  motion_transform_gpu,
1725                                                  OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
1726                                                  &instance.traversableHandle);
1727         }
1728         else {
1729           instance.traversableHandle = handle;
1730
1731           if (ob->get_geometry()->is_instanced()) {
1732             // Set transform matrix
1733             memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform));
1734           }
1735           else {
1736             // Disable instance transform if geometry already has it applied to vertex data
1737             instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
1738             // Non-instanced objects read ID from 'prim_object', so distinguish
1739             // them from instanced objects with the low bit set
1740             instance.instanceId |= 1;
1741           }
1742         }
1743       }
1744
1745       // Upload instance descriptions
1746 #  if OPTIX_ABI_VERSION < 41
1747       aabbs.resize(num_instances);
1748       aabbs.copy_to_device();
1749 #  endif
1750       instances.resize(num_instances);
1751       instances.copy_to_device();
1752
1753       // Build top-level acceleration structure (TLAS)
1754       OptixBuildInput build_input = {};
1755       build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
1756 #  if OPTIX_ABI_VERSION < 41  // Instance AABBs no longer need to be set since OptiX 7.2
1757       build_input.instanceArray.aabbs = aabbs.device_pointer;
1758       build_input.instanceArray.numAabbs = num_instances;
1759 #  endif
1760       build_input.instanceArray.instances = instances.device_pointer;
1761       build_input.instanceArray.numInstances = num_instances;
1762
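           // Zero motion steps here, since motion for the TLAS is handled through the SRT
           // motion transforms set up above rather than through motion options on the build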
1763       if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) {
1764         progress.set_error("Failed to build OptiX acceleration structure");
1765       }
1766       tlas_handle = bvh_optix->traversable_handle;
1767     }
1768   }
1769
1770   void const_copy_to(const char *name, void *host, size_t size) override
1771   {
1772     // Set constant memory for CUDA module
1773     // TODO(pmours): This is only used for tonemapping (see 'film_convert').
1774     //               Could be removed by moving those functions to filter CUDA module.
1775     CUDADevice::const_copy_to(name, host, size);
1776
1777     if (strcmp(name, "__data") == 0) {
1778       assert(size <= sizeof(KernelData));
1779
1780       // Update the traversable handle (it differs between devices in a multi-device setup)
1781       KernelData *const data = (KernelData *)host;
1782       *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
1783
1784       update_launch_params(offsetof(KernelParams, data), host, size);
1785       return;
1786     }
1787
1788     // Update data storage pointers in launch parameters
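         // For example, an entry like "KERNEL_TEX(float4, __tri_verts)" in kernel_textures.h
         // expands to:
         //   if (strcmp(name, "__tri_verts") == 0) {
         //     update_launch_params(offsetof(KernelParams, __tri_verts), host, size);
         //     return;
         //   }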
1789 #  define KERNEL_TEX(data_type, tex_name) \
1790     if (strcmp(name, #tex_name) == 0) { \
1791       update_launch_params(offsetof(KernelParams, tex_name), host, size); \
1792       return; \
1793     }
1794 #  include "kernel/kernel_textures.h"
1795 #  undef KERNEL_TEX
1796   }
1797
1798   void update_launch_params(size_t offset, void *data, size_t data_size)
1799   {
1800     const CUDAContextScope scope(cuContext);
1801
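         // The launch parameters are replicated once per thread/CUDA stream, so every copy
         // has to be updated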
1802     for (int i = 0; i < info.cpu_threads; ++i)
1803       check_result_cuda(
1804           cuMemcpyHtoD(launch_params.device_pointer + i * launch_params.data_elements + offset,
1805                        data,
1806                        data_size));
1807   }
1808
1809   void task_add(DeviceTask &task) override
1810   {
1811     // Upload texture information to device if it has changed since last launch
1812     load_texture_info();
1813
1814     if (task.type == DeviceTask::FILM_CONVERT) {
1815       // Execute in main thread because of OpenGL access
1816       film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
1817       return;
1818     }
1819
1820     if (task.type == DeviceTask::DENOISE_BUFFER) {
1821       // Execute denoising in a single thread (e.g. to avoid race conditions during creation)
1822       task_pool.push([=] {
1823         DeviceTask task_copy = task;
1824         thread_run(task_copy, 0);
1825       });
1826       return;
1827     }
1828
1829     // Split task into smaller ones
1830     list<DeviceTask> tasks;
1831     task.split(tasks, info.cpu_threads);
1832
1833     // Queue tasks in internal task pool
1834     int task_index = 0;
1835     for (DeviceTask &task : tasks) {
1836       task_pool.push([=] {
1837         // Use the task index instead of the thread index, since the number of CUDA streams
1838         // may differ from the number of threads
1839         DeviceTask task_copy = task;
1840         thread_run(task_copy, task_index);
1841       });
1842       task_index++;
1843     }
1844   }
1845
1846   void task_wait() override
1847   {
1848     // Wait for all queued tasks to finish
1849     task_pool.wait_work();
1850   }
1851
1852   void task_cancel() override
1853   {
1854     // Cancel any remaining tasks in the internal pool
1855     task_pool.cancel();
1856   }
1857 };
1858
1859 bool device_optix_init()
1860 {
1861   if (g_optixFunctionTable.optixDeviceContextCreate != NULL)
1862     return true;  // Already initialized function table
1863
1864   // Need to initialize CUDA as well
1865   if (!device_cuda_init())
1866     return false;
1867
1868   const OptixResult result = optixInit();
1869
1870   if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) {
1871     VLOG(1) << "OptiX initialization failed because the installed NVIDIA driver is too old. "
1872                "Please update to the latest driver first!";
1873     return false;
1874   }
1875   else if (result != OPTIX_SUCCESS) {
1876     VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result;
1877     return false;
1878   }
1879
1880   // Loaded OptiX successfully!
1881   return true;
1882 }
1883
1884 void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices)
1885 {
1886   devices.reserve(cuda_devices.size());
1887
1888   // Simply add all supported CUDA devices as OptiX devices again
1889   for (DeviceInfo info : cuda_devices) {
1890     assert(info.type == DEVICE_CUDA);
1891
1892     int major;
1893     cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num);
1894     if (major < 5) {
1895       continue;  // Only Maxwell and up are supported by OptiX
1896     }
1897
1898     info.type = DEVICE_OPTIX;
1899     info.id += "_OptiX";
1900     info.denoisers |= DENOISER_OPTIX;
1901     info.has_branched_path = false;
1902
1903     devices.push_back(info);
1904   }
1905 }
1906
1907 Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
1908 {
1909   return new OptiXDevice(info, stats, profiler, background);
1910 }
1911
1912 CCL_NAMESPACE_END
1913
1914 #endif