/*
 * Copyright 2019, NVIDIA Corporation.
 * Copyright 2019, Blender Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifdef WITH_OPTIX

#  include "bvh/bvh.h"
#  include "bvh/bvh_optix.h"
#  include "device/cuda/device_cuda.h"
#  include "device/device_denoising.h"
#  include "device/device_intern.h"
#  include "render/buffers.h"
#  include "render/hair.h"
#  include "render/mesh.h"
#  include "render/object.h"
#  include "render/scene.h"
#  include "util/util_debug.h"
#  include "util/util_logging.h"
#  include "util/util_md5.h"
#  include "util/util_path.h"
#  include "util/util_progress.h"
#  include "util/util_time.h"

#  ifdef WITH_CUDA_DYNLOAD
#    include <cuew.h>
// Do not use CUDA SDK headers when using CUEW
#    define OPTIX_DONT_INCLUDE_CUDA
#  endif
#  include <optix_function_table_definition.h>
#  include <optix_stubs.h>

// TODO(pmours): Disable this once drivers have native support
#  define OPTIX_DENOISER_NO_PIXEL_STRIDE 1

CCL_NAMESPACE_BEGIN

/* Make sure this stays in sync with kernel_globals.h */
struct ShaderParams {
  uint4 *input;
  float4 *output;
  int type;
  int filter;
  int sx;
  int offset;
  int sample;
};
struct KernelParams {
  WorkTile tile;
  KernelData data;
  ShaderParams shader;
#  define KERNEL_TEX(type, name) const type *name;
#  include "kernel/kernel_textures.h"
#  undef KERNEL_TEX
};
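// Note on the KERNEL_TEX trick above: every KERNEL_TEX(type, name) entry in
// kernel_textures.h expands to a device pointer member here, so KernelParams
// automatically mirrors the kernel texture list. For illustration (with a
// hypothetical entry), KERNEL_TEX(uint4, __example_tex) would expand to:
//   const uint4 *__example_tex;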

#  define check_result_cuda(stmt) \
    { \
      CUresult res = stmt; \
      if (res != CUDA_SUCCESS) { \
        const char *name; \
        cuGetErrorName(res, &name); \
        set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
        return; \
      } \
    } \
    (void)0
#  define check_result_cuda_ret(stmt) \
    { \
      CUresult res = stmt; \
      if (res != CUDA_SUCCESS) { \
        const char *name; \
        cuGetErrorName(res, &name); \
        set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
        return false; \
      } \
    } \
    (void)0

#  define check_result_optix(stmt) \
    { \
      enum OptixResult res = stmt; \
      if (res != OPTIX_SUCCESS) { \
        const char *name = optixGetErrorName(res); \
        set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
        return; \
      } \
    } \
    (void)0
#  define check_result_optix_ret(stmt) \
    { \
      enum OptixResult res = stmt; \
      if (res != OPTIX_SUCCESS) { \
        const char *name = optixGetErrorName(res); \
        set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
        return false; \
      } \
    } \
    (void)0
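// The macros above record the failing call and source line via set_error() and
// then return from the enclosing function (the *_ret variants return false, for
// use in functions returning bool). The trailing (void)0 makes each expansion
// behave like a single statement, so call sites require a terminating semicolon.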

#  define launch_filter_kernel(func_name, w, h, args) \
    { \
      CUfunction func; \
      check_result_cuda_ret(cuModuleGetFunction(&func, cuFilterModule, func_name)); \
      check_result_cuda_ret(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); \
      int threads; \
      check_result_cuda_ret( \
          cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
      threads = (int)sqrt((float)threads); \
      int xblocks = ((w) + threads - 1) / threads; \
      int yblocks = ((h) + threads - 1) / threads; \
      check_result_cuda_ret( \
          cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0)); \
    } \
    (void)0
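// The launch configuration above derives a square thread block from the
// kernel's reported thread limit and rounds the grid up to cover w x h. For
// illustration (the actual limit is device- and kernel-dependent): with a
// maximum of 1024 threads per block this yields 32x32 blocks, so a 300x200
// launch uses ceil(300/32) x ceil(200/32) = 10 x 7 blocks.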

class OptiXDevice : public CUDADevice {

  // List of OptiX program groups
  enum {
    PG_RGEN,
    PG_MISS,
    PG_HITD,  // Default hit group
    PG_HITS,  // __SHADOW_RECORD_ALL__ hit group
    PG_HITL,  // __BVH_LOCAL__ hit group (only used for triangles)
#  if OPTIX_ABI_VERSION >= 36
    PG_HITD_MOTION,
    PG_HITS_MOTION,
#  endif
    PG_BAKE,  // kernel_bake_evaluate
    PG_DISP,  // kernel_displace_evaluate
    PG_BACK,  // kernel_background_evaluate
    PG_CALL,
    NUM_PROGRAM_GROUPS = PG_CALL + 3
  };
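  // Note: PG_CALL reserves three consecutive slots (hence PG_CALL + 3 above),
  // one for each direct callable registered in load_kernels() below:
  // svm_eval_nodes, kernel_volume_shadow and subsurface_scatter_multi_setup.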

  // List of OptiX pipelines
  enum { PIP_PATH_TRACE, PIP_SHADER_EVAL, NUM_PIPELINES };

  // A single shader binding table entry
  struct SbtRecord {
    char header[OPTIX_SBT_RECORD_HEADER_SIZE];
  };

  // Information stored about CUDA memory allocations
  struct CUDAMem {
    bool free_map_host = false;
    CUarray array = NULL;
    CUtexObject texobject = 0;
    bool use_mapped_host = false;
  };

  // Helper class to manage current CUDA context
  struct CUDAContextScope {
    CUDAContextScope(CUcontext ctx)
    {
      cuCtxPushCurrent(ctx);
    }
    ~CUDAContextScope()
    {
      cuCtxPopCurrent(NULL);
    }
  };
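  // CUDAContextScope is a small RAII guard: it pushes the context on
  // construction and pops it again in the destructor, so the previous context
  // is restored on every exit path, including the early returns generated by
  // the check_result_* macros above.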

  // Use a pool with multiple threads to support launches with multiple CUDA streams
  TaskPool task_pool;

  vector<CUstream> cuda_stream;
  OptixDeviceContext context = NULL;

  OptixModule optix_module = NULL;  // All necessary OptiX kernels are in one module
  OptixModule builtin_modules[2] = {};
  OptixPipeline pipelines[NUM_PIPELINES] = {};

  bool motion_blur = false;
  device_vector<SbtRecord> sbt_data;
  device_only_memory<KernelParams> launch_params;
  OptixTraversableHandle tlas_handle = 0;

  OptixDenoiser denoiser = NULL;
  device_only_memory<unsigned char> denoiser_state;
  int denoiser_input_passes = 0;

  vector<device_only_memory<char>> delayed_free_bvh_memory;
  thread_mutex delayed_free_bvh_mutex;

 public:
  OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
      : CUDADevice(info_, stats_, profiler_, background_),
        sbt_data(this, "__sbt", MEM_READ_ONLY),
        launch_params(this, "__params", false),
        denoiser_state(this, "__denoiser_state", true)
  {
    // Store number of CUDA streams in device info
    info.cpu_threads = DebugFlags().optix.cuda_streams;

    // Make the CUDA context current
    if (!cuContext) {
      return;  // Do not initialize if CUDA context creation failed already
    }
    const CUDAContextScope scope(cuContext);

    // Create OptiX context for this device
    OptixDeviceContextOptions options = {};
#  ifdef WITH_CYCLES_LOGGING
    options.logCallbackLevel = 4;  // Fatal = 1, Error = 2, Warning = 3, Print = 4
    options.logCallbackFunction =
        [](unsigned int level, const char *, const char *message, void *) {
          switch (level) {
            case 1:
              LOG_IF(FATAL, VLOG_IS_ON(1)) << message;
              break;
            case 2:
              LOG_IF(ERROR, VLOG_IS_ON(1)) << message;
              break;
            case 3:
              LOG_IF(WARNING, VLOG_IS_ON(1)) << message;
              break;
            case 4:
              LOG_IF(INFO, VLOG_IS_ON(1)) << message;
              break;
          }
        };
#  endif
#  if OPTIX_ABI_VERSION >= 41 && defined(WITH_CYCLES_DEBUG)
    options.validationMode = OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_ALL;
#  endif
    check_result_optix(optixDeviceContextCreate(cuContext, &options, &context));
#  ifdef WITH_CYCLES_LOGGING
    check_result_optix(optixDeviceContextSetLogCallback(
        context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel));
#  endif

    // Create launch streams
    cuda_stream.resize(info.cpu_threads);
    for (int i = 0; i < info.cpu_threads; ++i)
      check_result_cuda(cuStreamCreate(&cuda_stream[i], CU_STREAM_NON_BLOCKING));

    // Fix weird compiler bug that assigns wrong size
    launch_params.data_elements = sizeof(KernelParams);
    // Allocate launch parameter buffer memory on device
    launch_params.alloc_to_device(info.cpu_threads);
  }
  ~OptiXDevice()
  {
    // Stop processing any more tasks
    task_pool.cancel();

    // Make CUDA context current
    const CUDAContextScope scope(cuContext);

    free_bvh_memory_delayed();

    sbt_data.free();
    texture_info.free();
    launch_params.free();
    denoiser_state.free();

    // Unload modules
    if (optix_module != NULL)
      optixModuleDestroy(optix_module);
    for (unsigned int i = 0; i < 2; ++i)
      if (builtin_modules[i] != NULL)
        optixModuleDestroy(builtin_modules[i]);
    for (unsigned int i = 0; i < NUM_PIPELINES; ++i)
      if (pipelines[i] != NULL)
        optixPipelineDestroy(pipelines[i]);

    // Destroy launch streams
    for (CUstream stream : cuda_stream)
      cuStreamDestroy(stream);

    if (denoiser != NULL)
      optixDenoiserDestroy(denoiser);

    optixDeviceContextDestroy(context);
  }

 private:
  bool show_samples() const override
  {
    // Only show samples if not rendering multiple tiles in parallel
    return info.cpu_threads == 1;
  }

  BVHLayoutMask get_bvh_layout_mask() const override
  {
    // CUDA kernels are used when baking, so we need to build a BVH those can understand too!
    if (optix_module == NULL)
      return CUDADevice::get_bvh_layout_mask();

    // OptiX has its own internal acceleration structure format
    return BVH_LAYOUT_OPTIX;
  }

  string compile_kernel_get_common_cflags(const DeviceRequestedFeatures &requested_features,
                                          bool filter,
                                          bool /*split*/) override
  {
    // Split kernel is not supported in OptiX
    string common_cflags = CUDADevice::compile_kernel_get_common_cflags(
        requested_features, filter, false);

    // Add OptiX SDK include directory to include paths
    const char *optix_sdk_path = getenv("OPTIX_ROOT_DIR");
    if (optix_sdk_path) {
      common_cflags += string_printf(" -I\"%s/include\"", optix_sdk_path);
    }

    // Specialization for shader raytracing
    if (requested_features.use_shader_raytrace) {
      common_cflags += " --keep-device-functions";
    }
    else {
      common_cflags += " -D __NO_SHADER_RAYTRACE__";
    }

    return common_cflags;
  }

  bool load_kernels(const DeviceRequestedFeatures &requested_features) override
  {
    if (have_error()) {
      // Abort early if context creation failed already
      return false;
    }

    // Load CUDA modules because we need some of the utility kernels
    if (!CUDADevice::load_kernels(requested_features)) {
      return false;
    }

    // Baking is currently performed using CUDA, so no need to load OptiX kernels
    if (requested_features.use_baking) {
      return true;
    }

    const CUDAContextScope scope(cuContext);

    // Unload existing OptiX module and pipelines first
    if (optix_module != NULL) {
      optixModuleDestroy(optix_module);
      optix_module = NULL;
    }
    for (unsigned int i = 0; i < 2; ++i) {
      if (builtin_modules[i] != NULL) {
        optixModuleDestroy(builtin_modules[i]);
        builtin_modules[i] = NULL;
      }
    }
    for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
      if (pipelines[i] != NULL) {
        optixPipelineDestroy(pipelines[i]);
        pipelines[i] = NULL;
      }
    }

    OptixModuleCompileOptions module_options = {};
    module_options.maxRegisterCount = 0;  // Do not set an explicit register limit
#  ifdef WITH_CYCLES_DEBUG
    module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_0;
    module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
#  else
    module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
    module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
#  endif

#  if OPTIX_ABI_VERSION >= 41
    module_options.boundValues = nullptr;
    module_options.numBoundValues = 0;
#  endif

    OptixPipelineCompileOptions pipeline_options = {};
    // Default to no motion blur and two-level graph, since it is the fastest option
    pipeline_options.usesMotionBlur = false;
    pipeline_options.traversableGraphFlags =
        OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING;
    pipeline_options.numPayloadValues = 6;
    pipeline_options.numAttributeValues = 2;  // u, v
    pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE;
    pipeline_options.pipelineLaunchParamsVariableName = "__params";  // See kernel_globals.h

#  if OPTIX_ABI_VERSION >= 36
    pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE;
    if (requested_features.use_hair) {
      if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) {
        pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE;
      }
      else {
        pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
      }
    }
#  endif

    // Keep track of whether motion blur is enabled, so as to enable/disable motion in BVH builds
    // This is necessary since objects may be reported to have motion if the Vector pass is
    // active, but may still need to be rendered without motion blur if motion blur itself is off
    motion_blur = requested_features.use_object_motion;

    if (motion_blur) {
      pipeline_options.usesMotionBlur = true;
      // Motion blur can insert motion transforms into the traversal graph
      // It is no longer a two-level graph then, so need to set flags to allow any configuration
      pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY;
    }

    {  // Load and compile PTX module with OptiX kernels
      string ptx_data, ptx_filename = path_get(requested_features.use_shader_raytrace ?
                                                   "lib/kernel_optix_shader_raytrace.ptx" :
                                                   "lib/kernel_optix.ptx");
      if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
        if (!getenv("OPTIX_ROOT_DIR")) {
          set_error(
              "Missing OPTIX_ROOT_DIR environment variable (which must be set with the path to "
              "the OptiX SDK to be able to compile OptiX kernels on demand).");
          return false;
        }
        ptx_filename = compile_kernel(requested_features, "kernel_optix", "optix", true);
      }
      if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) {
        set_error("Failed to load OptiX kernel from '" + ptx_filename + "'");
        return false;
      }

      check_result_optix_ret(optixModuleCreateFromPTX(context,
                                                      &module_options,
                                                      &pipeline_options,
                                                      ptx_data.data(),
                                                      ptx_data.size(),
                                                      nullptr,
                                                      0,
                                                      &optix_module));
    }

    // Create program groups
    OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
    OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
    OptixProgramGroupOptions group_options = {};  // There are no options currently
    group_descs[PG_RGEN].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
    group_descs[PG_RGEN].raygen.module = optix_module;
    // Ignore branched integrator for now (see "requested_features.use_integrator_branched")
    group_descs[PG_RGEN].raygen.entryFunctionName = "__raygen__kernel_optix_path_trace";
    group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
    group_descs[PG_MISS].miss.module = optix_module;
    group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss";
    group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
    group_descs[PG_HITD].hitgroup.moduleCH = optix_module;
    group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit";
    group_descs[PG_HITD].hitgroup.moduleAH = optix_module;
    group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test";
    group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
    group_descs[PG_HITS].hitgroup.moduleAH = optix_module;
    group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit";

    if (requested_features.use_hair) {
      group_descs[PG_HITD].hitgroup.moduleIS = optix_module;
      group_descs[PG_HITS].hitgroup.moduleIS = optix_module;

      // Add curve intersection programs
      if (requested_features.use_hair_thick) {
        // Slower programs for thick hair since that also slows down ribbons.
        // Ideally this should not be needed.
        group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_all";
        group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_all";
      }
      else {
        group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
        group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
      }

#  if OPTIX_ABI_VERSION >= 36
      if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) {
        OptixBuiltinISOptions builtin_options = {};
        builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
        builtin_options.usesMotionBlur = false;

        check_result_optix_ret(optixBuiltinISModuleGet(
            context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0]));

        group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0];
        group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr;
        group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0];
        group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr;

        if (motion_blur) {
          builtin_options.usesMotionBlur = true;

          check_result_optix_ret(optixBuiltinISModuleGet(
              context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1]));

          group_descs[PG_HITD_MOTION] = group_descs[PG_HITD];
          group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1];
          group_descs[PG_HITS_MOTION] = group_descs[PG_HITS];
          group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1];
        }
      }
#  endif
    }

    if (requested_features.use_subsurface || requested_features.use_shader_raytrace) {
      // Add hit group for local intersections
      group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
      group_descs[PG_HITL].hitgroup.moduleAH = optix_module;
      group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit";
    }

    if (requested_features.use_baking) {
      group_descs[PG_BAKE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
      group_descs[PG_BAKE].raygen.module = optix_module;
      group_descs[PG_BAKE].raygen.entryFunctionName = "__raygen__kernel_optix_bake";
    }

    if (requested_features.use_true_displacement) {
      group_descs[PG_DISP].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
      group_descs[PG_DISP].raygen.module = optix_module;
      group_descs[PG_DISP].raygen.entryFunctionName = "__raygen__kernel_optix_displace";
    }

    if (requested_features.use_background_light) {
      group_descs[PG_BACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
      group_descs[PG_BACK].raygen.module = optix_module;
      group_descs[PG_BACK].raygen.entryFunctionName = "__raygen__kernel_optix_background";
    }

    // Shader raytracing replaces some functions with direct callables
    if (requested_features.use_shader_raytrace) {
      group_descs[PG_CALL + 0].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
      group_descs[PG_CALL + 0].callables.moduleDC = optix_module;
      group_descs[PG_CALL + 0].callables.entryFunctionNameDC = "__direct_callable__svm_eval_nodes";
      group_descs[PG_CALL + 1].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
      group_descs[PG_CALL + 1].callables.moduleDC = optix_module;
      group_descs[PG_CALL + 1].callables.entryFunctionNameDC =
          "__direct_callable__kernel_volume_shadow";
      group_descs[PG_CALL + 2].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
      group_descs[PG_CALL + 2].callables.moduleDC = optix_module;
      group_descs[PG_CALL + 2].callables.entryFunctionNameDC =
          "__direct_callable__subsurface_scatter_multi_setup";
    }

    check_result_optix_ret(optixProgramGroupCreate(
        context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups));

    // Get program stack sizes
    OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
    // Set up SBT, which in this case is used only to select between different programs
    sbt_data.alloc(NUM_PROGRAM_GROUPS);
    memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS);
    for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
      check_result_optix_ret(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
      check_result_optix_ret(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
    }
    sbt_data.copy_to_device();  // Upload SBT to device

    // Calculate maximum trace continuation stack size
    unsigned int trace_css = stack_size[PG_HITD].cssCH;
    // This is based on the maximum of closest-hit and any-hit/intersection programs
    trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
    trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
    trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
#  if OPTIX_ABI_VERSION >= 36
    trace_css = std::max(trace_css,
                         stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH);
    trace_css = std::max(trace_css,
                         stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH);
#  endif

    OptixPipelineLinkOptions link_options = {};
    link_options.maxTraceDepth = 1;
#  ifdef WITH_CYCLES_DEBUG
    link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
#  else
    link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
#  endif
#  if OPTIX_ABI_VERSION < 24
    link_options.overrideUsesMotionBlur = motion_blur;
#  endif

    {  // Create path tracing pipeline
      vector<OptixProgramGroup> pipeline_groups;
      pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
      pipeline_groups.push_back(groups[PG_RGEN]);
      pipeline_groups.push_back(groups[PG_MISS]);
      pipeline_groups.push_back(groups[PG_HITD]);
      pipeline_groups.push_back(groups[PG_HITS]);
      pipeline_groups.push_back(groups[PG_HITL]);
#  if OPTIX_ABI_VERSION >= 36
      if (motion_blur) {
        pipeline_groups.push_back(groups[PG_HITD_MOTION]);
        pipeline_groups.push_back(groups[PG_HITS_MOTION]);
      }
#  endif
      if (requested_features.use_shader_raytrace) {
        pipeline_groups.push_back(groups[PG_CALL + 0]);
        pipeline_groups.push_back(groups[PG_CALL + 1]);
        pipeline_groups.push_back(groups[PG_CALL + 2]);
      }

      check_result_optix_ret(optixPipelineCreate(context,
                                                 &pipeline_options,
                                                 &link_options,
                                                 pipeline_groups.data(),
                                                 pipeline_groups.size(),
                                                 nullptr,
                                                 0,
                                                 &pipelines[PIP_PATH_TRACE]));

      // Combine ray generation and trace continuation stack size
      const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css;
      // Max direct callable depth is one of the following, so combine accordingly
      // - __raygen__ -> svm_eval_nodes
      // - __raygen__ -> kernel_volume_shadow -> svm_eval_nodes
      // - __raygen__ -> subsurface_scatter_multi_setup -> svm_eval_nodes
      const unsigned int dss = stack_size[PG_CALL + 0].dssDC +
                               std::max(stack_size[PG_CALL + 1].dssDC,
                                        stack_size[PG_CALL + 2].dssDC);
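      // In other words, the deepest chain is svm_eval_nodes stacked on top of
      // whichever of the two intermediate callables needs the larger stack,
      // which is exactly what the sum above models.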

      // Set stack size depending on pipeline options
      check_result_optix_ret(
          optixPipelineSetStackSize(pipelines[PIP_PATH_TRACE],
                                    0,
                                    requested_features.use_shader_raytrace ? dss : 0,
                                    css,
                                    motion_blur ? 3 : 2));
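      // Note: with maxTraceDepth = 1 the continuation stack is simply
      // cssRG + trace_css. The last argument is the maximum traversable graph
      // depth: 2 for the default IAS -> GAS setup, 3 when motion blur can
      // insert motion transforms into the graph (see pipeline_options above).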
    }

    // Only need to create shader evaluation pipeline if one of these features is used:
    const bool use_shader_eval_pipeline = requested_features.use_baking ||
                                          requested_features.use_background_light ||
                                          requested_features.use_true_displacement;

    if (use_shader_eval_pipeline) {  // Create shader evaluation pipeline
      vector<OptixProgramGroup> pipeline_groups;
      pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
      pipeline_groups.push_back(groups[PG_BAKE]);
      pipeline_groups.push_back(groups[PG_DISP]);
      pipeline_groups.push_back(groups[PG_BACK]);
      pipeline_groups.push_back(groups[PG_MISS]);
      pipeline_groups.push_back(groups[PG_HITD]);
      pipeline_groups.push_back(groups[PG_HITS]);
      pipeline_groups.push_back(groups[PG_HITL]);
#  if OPTIX_ABI_VERSION >= 36
      if (motion_blur) {
        pipeline_groups.push_back(groups[PG_HITD_MOTION]);
        pipeline_groups.push_back(groups[PG_HITS_MOTION]);
      }
#  endif
      if (requested_features.use_shader_raytrace) {
        pipeline_groups.push_back(groups[PG_CALL + 0]);
        pipeline_groups.push_back(groups[PG_CALL + 1]);
        pipeline_groups.push_back(groups[PG_CALL + 2]);
      }

      check_result_optix_ret(optixPipelineCreate(context,
                                                 &pipeline_options,
                                                 &link_options,
                                                 pipeline_groups.data(),
                                                 pipeline_groups.size(),
                                                 nullptr,
                                                 0,
                                                 &pipelines[PIP_SHADER_EVAL]));

      // Calculate continuation stack size based on the maximum of all ray generation stack sizes
      const unsigned int css = std::max(stack_size[PG_BAKE].cssRG,
                                        std::max(stack_size[PG_DISP].cssRG,
                                                 stack_size[PG_BACK].cssRG)) +
                               link_options.maxTraceDepth * trace_css;
      const unsigned int dss = stack_size[PG_CALL + 0].dssDC +
                               std::max(stack_size[PG_CALL + 1].dssDC,
                                        stack_size[PG_CALL + 2].dssDC);

      check_result_optix_ret(
          optixPipelineSetStackSize(pipelines[PIP_SHADER_EVAL],
                                    0,
                                    requested_features.use_shader_raytrace ? dss : 0,
                                    css,
                                    motion_blur ? 3 : 2));
    }

    // Clean up program group objects
    for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
      optixProgramGroupDestroy(groups[i]);
    }

    return true;
  }

  void thread_run(DeviceTask &task, int thread_index)  // Main task entry point
  {
    if (have_error())
      return;  // Abort early if there was an error previously

    if (task.type == DeviceTask::RENDER) {
      if (thread_index != 0) {
        // Only execute denoising in a single thread (see also 'task_add')
        task.tile_types &= ~RenderTile::DENOISE;
      }

      RenderTile tile;
      while (task.acquire_tile(this, tile, task.tile_types)) {
        if (tile.task == RenderTile::PATH_TRACE)
          launch_render(task, tile, thread_index);
        else if (tile.task == RenderTile::BAKE) {
          // Perform baking using CUDA, since it is not currently implemented in OptiX
          device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
          CUDADevice::render(task, tile, work_tiles);
        }
        else if (tile.task == RenderTile::DENOISE)
          launch_denoise(task, tile);
        task.release_tile(tile);
        if (task.get_cancel() && !task.need_finish_queue)
          break;  // User requested cancellation
        else if (have_error())
          break;  // Abort rendering when encountering an error
      }
    }
    else if (task.type == DeviceTask::SHADER) {
      launch_shader_eval(task, thread_index);
    }
    else if (task.type == DeviceTask::DENOISE_BUFFER) {
      // Set up a single tile that covers the whole task and denoise it
      RenderTile tile;
      tile.x = task.x;
      tile.y = task.y;
      tile.w = task.w;
      tile.h = task.h;
      tile.buffer = task.buffer;
      tile.num_samples = task.num_samples;
      tile.start_sample = task.sample;
      tile.offset = task.offset;
      tile.stride = task.stride;
      tile.buffers = task.buffers;

      launch_denoise(task, tile);
    }
  }

  void launch_render(DeviceTask &task, RenderTile &rtile, int thread_index)
  {
    assert(thread_index < launch_params.data_size);

    // Keep track of total render time of this tile
    const scoped_timer timer(&rtile.buffers->render_time);

    WorkTile wtile;
    wtile.x = rtile.x;
    wtile.y = rtile.y;
    wtile.w = rtile.w;
    wtile.h = rtile.h;
    wtile.offset = rtile.offset;
    wtile.stride = rtile.stride;
    wtile.buffer = (float *)rtile.buffer;

    const int end_sample = rtile.start_sample + rtile.num_samples;
    // Keep this number reasonable to avoid running into TDRs
    int step_samples = (info.display_device ? 8 : 32);

    // Offset into launch params buffer so that streams use separate data
    device_ptr launch_params_ptr = launch_params.device_pointer +
                                   thread_index * launch_params.data_elements;

    const CUDAContextScope scope(cuContext);

    for (int sample = rtile.start_sample; sample < end_sample;) {
      // Copy work tile information to device
      wtile.start_sample = sample;
      wtile.num_samples = step_samples;
      if (task.adaptive_sampling.use) {
        wtile.num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
      }
      wtile.num_samples = min(wtile.num_samples, end_sample - sample);
      device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
      check_result_cuda(
          cuMemcpyHtoDAsync(d_wtile_ptr, &wtile, sizeof(wtile), cuda_stream[thread_index]));

      OptixShaderBindingTable sbt_params = {};
      sbt_params.raygenRecord = sbt_data.device_pointer + PG_RGEN * sizeof(SbtRecord);
      sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord);
      sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
      sbt_params.missRecordCount = 1;
      sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
      sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
#  if OPTIX_ABI_VERSION >= 36
      sbt_params.hitgroupRecordCount = 5;  // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL
#  else
      sbt_params.hitgroupRecordCount = 3;  // PG_HITD, PG_HITS, PG_HITL
#  endif
      sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord);
      sbt_params.callablesRecordCount = 3;
      sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);

      // Launch the ray generation program
      check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE],
                                     cuda_stream[thread_index],
                                     launch_params_ptr,
                                     launch_params.data_elements,
                                     &sbt_params,
                                     // Launch with samples close to each other for better locality
                                     wtile.w * wtile.num_samples,
                                     wtile.h,
                                     1));

      // Run the adaptive sampling kernels at selected samples aligned to step samples.
      uint filter_sample = wtile.start_sample + wtile.num_samples - 1;
      if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
        adaptive_sampling_filter(filter_sample, &wtile, d_wtile_ptr, cuda_stream[thread_index]);
      }

      // Wait for launch to finish
      check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));

      // Update current sample, so it is displayed correctly
      sample += wtile.num_samples;
      rtile.sample = sample;
      // Update task progress after the kernel completed rendering
      task.update_progress(&rtile, wtile.w * wtile.h * wtile.num_samples);

      if (task.get_cancel() && !task.need_finish_queue)
        return;  // Cancel rendering
    }

    // Finalize adaptive sampling
    if (task.adaptive_sampling.use) {
      device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
      adaptive_sampling_post(rtile, &wtile, d_wtile_ptr, cuda_stream[thread_index]);
      check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
      task.update_progress(&rtile, rtile.w * rtile.h * wtile.num_samples);
    }
  }

  bool launch_denoise(DeviceTask &task, RenderTile &rtile)
  {
    // Update current sample (for display and NLM denoising task)
    rtile.sample = rtile.start_sample + rtile.num_samples;

    // Make CUDA context current now, since it is used for both denoising tasks
    const CUDAContextScope scope(cuContext);

    // Choose between OptiX and NLM denoising
    if (task.denoising.type == DENOISER_OPTIX) {
      // Map neighboring tiles onto this device, indices are as follows:
      // Where index 4 is the center tile and index 9 is the target for the result.
      //   0 1 2
      //   3 4 5
      //   6 7 8  9
      RenderTileNeighbors neighbors(rtile);
      task.map_neighbor_tiles(neighbors, this);
      RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
      RenderTile &target_tile = neighbors.target;
      rtile = center_tile;  // Tile may have been modified by mapping code

      // Calculate size of the tile to denoise (including overlap)
      int4 rect = center_tile.bounds();
      // Overlap between tiles has to be at least 64 pixels
      // TODO(pmours): Query this value from OptiX
      rect = rect_expand(rect, 64);
      int4 clip_rect = neighbors.bounds();
      rect = rect_clip(rect, clip_rect);
      int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y);
      int2 overlap_offset = make_int2(rtile.x - rect.x, rtile.y - rect.y);
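      // For illustration: a 256x256 interior tile expanded by 64 pixels on each
      // side gives a 384x384 rect to denoise; overlap_offset is then (64, 64),
      // the position of the tile origin inside that rect. Tiles at the image
      // border get clipped by neighbors.bounds(), so the rect and offset shrink
      // accordingly.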

      // Calculate byte offsets and strides
      int pixel_stride = task.pass_stride * (int)sizeof(float);
      int pixel_offset = (rtile.offset + rtile.x + rtile.y * rtile.stride) * pixel_stride;
      const int pass_offset[3] = {
          (task.pass_denoising_data + DENOISING_PASS_COLOR) * (int)sizeof(float),
          (task.pass_denoising_data + DENOISING_PASS_ALBEDO) * (int)sizeof(float),
          (task.pass_denoising_data + DENOISING_PASS_NORMAL) * (int)sizeof(float)};

      // Start with the current tile pointer offset
      int input_stride = pixel_stride;
      device_ptr input_ptr = rtile.buffer + pixel_offset;

      // Copy tile data into a common buffer if necessary
      device_only_memory<float> input(this, "denoiser input", true);
      device_vector<TileInfo> tile_info_mem(this, "denoiser tile info", MEM_READ_ONLY);

      bool contiguous_memory = true;
      for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
        if (neighbors.tiles[i].buffer && neighbors.tiles[i].buffer != rtile.buffer) {
          contiguous_memory = false;
        }
      }

      if (contiguous_memory) {
        // Tiles are in contiguous memory, so we can just subtract the overlap offset
        input_ptr -= (overlap_offset.x + overlap_offset.y * rtile.stride) * pixel_stride;
        // Stride covers the whole width of the image and not just a single tile
        input_stride *= rtile.stride;
      }
      else {
        // Adjacent tiles are in separate memory regions, so need to copy them into a single one
        input.alloc_to_device(rect_size.x * rect_size.y * task.pass_stride);
        // Start with the new input buffer
        input_ptr = input.device_pointer;
        // Stride covers the width of the new input buffer, which includes tile width and overlap
        input_stride *= rect_size.x;

        TileInfo *tile_info = tile_info_mem.alloc(1);
        for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
          tile_info->offsets[i] = neighbors.tiles[i].offset;
          tile_info->strides[i] = neighbors.tiles[i].stride;
          tile_info->buffers[i] = neighbors.tiles[i].buffer;
        }
        tile_info->x[0] = neighbors.tiles[3].x;
        tile_info->x[1] = neighbors.tiles[4].x;
        tile_info->x[2] = neighbors.tiles[5].x;
        tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w;
        tile_info->y[0] = neighbors.tiles[1].y;
        tile_info->y[1] = neighbors.tiles[4].y;
        tile_info->y[2] = neighbors.tiles[7].y;
        tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h;
        tile_info_mem.copy_to_device();

        void *args[] = {
            &input.device_pointer, &tile_info_mem.device_pointer, &rect.x, &task.pass_stride};
        launch_filter_kernel("kernel_cuda_filter_copy_input", rect_size.x, rect_size.y, args);
      }

#  if OPTIX_DENOISER_NO_PIXEL_STRIDE
      device_only_memory<float> input_rgb(this, "denoiser input rgb", true);
      input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * task.denoising.input_passes);

      void *input_args[] = {&input_rgb.device_pointer,
                            &input_ptr,
                            &rect_size.x,
                            &rect_size.y,
                            &input_stride,
                            &task.pass_stride,
                            const_cast<int *>(pass_offset),
                            &task.denoising.input_passes,
                            &rtile.sample};
      launch_filter_kernel(
          "kernel_cuda_filter_convert_to_rgb", rect_size.x, rect_size.y, input_args);

      input_ptr = input_rgb.device_pointer;
      pixel_stride = 3 * sizeof(float);
      input_stride = rect_size.x * pixel_stride;
#  endif
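      // The repacking above works around missing pixel-stride support in the
      // denoiser (see the OPTIX_DENOISER_NO_PIXEL_STRIDE TODO at the top of
      // this file): instead of pointing the denoiser at the interleaved render
      // buffer with task.pass_stride floats per pixel, each requested pass is
      // copied into a tightly packed 3-float RGB plane of
      // rect_size.x * rect_size.y pixels.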

      const bool recreate_denoiser = (denoiser == NULL) ||
                                     (task.denoising.input_passes != denoiser_input_passes);
      if (recreate_denoiser) {
        // Destroy existing handle before creating new one
        if (denoiser != NULL) {
          optixDenoiserDestroy(denoiser);
        }

        // Create OptiX denoiser handle on demand when it is first used
        OptixDenoiserOptions denoiser_options = {};
        assert(task.denoising.input_passes >= 1 && task.denoising.input_passes <= 3);
#  if OPTIX_ABI_VERSION >= 47
        denoiser_options.guideAlbedo = task.denoising.input_passes >= 2;
        denoiser_options.guideNormal = task.denoising.input_passes >= 3;
        check_result_optix_ret(optixDenoiserCreate(
            context, OPTIX_DENOISER_MODEL_KIND_HDR, &denoiser_options, &denoiser));
#  else
        denoiser_options.inputKind = static_cast<OptixDenoiserInputKind>(
            OPTIX_DENOISER_INPUT_RGB + (task.denoising.input_passes - 1));
#    if OPTIX_ABI_VERSION < 28
        denoiser_options.pixelFormat = OPTIX_PIXEL_FORMAT_FLOAT3;
#    endif
        check_result_optix_ret(optixDenoiserCreate(context, &denoiser_options, &denoiser));
        check_result_optix_ret(
            optixDenoiserSetModel(denoiser, OPTIX_DENOISER_MODEL_KIND_HDR, NULL, 0));
#  endif

        // OptiX denoiser handle was created with the requested number of input passes
        denoiser_input_passes = task.denoising.input_passes;
      }

      OptixDenoiserSizes sizes = {};
      check_result_optix_ret(
          optixDenoiserComputeMemoryResources(denoiser, rect_size.x, rect_size.y, &sizes));

#  if OPTIX_ABI_VERSION < 28
      const size_t scratch_size = sizes.recommendedScratchSizeInBytes;
#  else
      const size_t scratch_size = sizes.withOverlapScratchSizeInBytes;
#  endif
      const size_t scratch_offset = sizes.stateSizeInBytes;
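      // State and scratch share a single allocation below: bytes
      // [0, scratch_offset) hold the denoiser state and
      // [scratch_offset, scratch_offset + scratch_size) the scratch space, which
      // is why both optixDenoiserSetup and optixDenoiserInvoke are passed the
      // same base pointer plus scratch_offset.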

      // Allocate denoiser state if tile size has changed since last setup
      if (recreate_denoiser || (denoiser_state.data_width != rect_size.x ||
                                denoiser_state.data_height != rect_size.y)) {
        denoiser_state.alloc_to_device(scratch_offset + scratch_size);

        // Initialize denoiser state for the current tile size
        check_result_optix_ret(optixDenoiserSetup(denoiser,
                                                  0,
                                                  rect_size.x,
                                                  rect_size.y,
                                                  denoiser_state.device_pointer,
                                                  scratch_offset,
                                                  denoiser_state.device_pointer + scratch_offset,
                                                  scratch_size));

        denoiser_state.data_width = rect_size.x;
        denoiser_state.data_height = rect_size.y;
      }

      // Set up input and output layer information
      OptixImage2D input_layers[3] = {};
      OptixImage2D output_layers[1] = {};

      for (int i = 0; i < 3; ++i) {
#  if OPTIX_DENOISER_NO_PIXEL_STRIDE
        input_layers[i].data = input_ptr + (rect_size.x * rect_size.y * pixel_stride * i);
#  else
        input_layers[i].data = input_ptr + pass_offset[i];
#  endif
        input_layers[i].width = rect_size.x;
        input_layers[i].height = rect_size.y;
        input_layers[i].rowStrideInBytes = input_stride;
        input_layers[i].pixelStrideInBytes = pixel_stride;
        input_layers[i].format = OPTIX_PIXEL_FORMAT_FLOAT3;
      }

#  if OPTIX_DENOISER_NO_PIXEL_STRIDE
      output_layers[0].data = input_ptr;
      output_layers[0].width = rect_size.x;
      output_layers[0].height = rect_size.y;
      output_layers[0].rowStrideInBytes = input_stride;
      output_layers[0].pixelStrideInBytes = pixel_stride;
      int2 output_offset = overlap_offset;
      overlap_offset = make_int2(0, 0);  // Not supported by denoiser API, so apply manually
#  else
      output_layers[0].data = target_tile.buffer + pixel_offset;
      output_layers[0].width = target_tile.w;
      output_layers[0].height = target_tile.h;
      output_layers[0].rowStrideInBytes = target_tile.stride * pixel_stride;
      output_layers[0].pixelStrideInBytes = pixel_stride;
#  endif
      output_layers[0].format = OPTIX_PIXEL_FORMAT_FLOAT3;

#  if OPTIX_ABI_VERSION >= 47
      OptixDenoiserLayer image_layers = {};
      image_layers.input = input_layers[0];
      image_layers.output = output_layers[0];

      OptixDenoiserGuideLayer guide_layers = {};
      guide_layers.albedo = input_layers[1];
      guide_layers.normal = input_layers[2];
#  endif

      // Finally run denoising
1058       OptixDenoiserParams params = {};  // All parameters are disabled/zero
1059 #  if OPTIX_ABI_VERSION >= 47
1060       check_result_optix_ret(optixDenoiserInvoke(denoiser,
1061                                                  NULL,
1062                                                  &params,
1063                                                  denoiser_state.device_pointer,
1064                                                  scratch_offset,
1065                                                  &guide_layers,
1066                                                  &image_layers,
1067                                                  1,
1068                                                  overlap_offset.x,
1069                                                  overlap_offset.y,
1070                                                  denoiser_state.device_pointer + scratch_offset,
1071                                                  scratch_size));
1072 #  else
1073       check_result_optix_ret(optixDenoiserInvoke(denoiser,
1074                                                  NULL,
1075                                                  &params,
1076                                                  denoiser_state.device_pointer,
1077                                                  scratch_offset,
1078                                                  input_layers,
1079                                                  task.denoising.input_passes,
1080                                                  overlap_offset.x,
1081                                                  overlap_offset.y,
1082                                                  output_layers,
1083                                                  denoiser_state.device_pointer + scratch_offset,
1084                                                  scratch_size));
1085 #  endif
1086
1087 #  if OPTIX_DENOISER_NO_PIXEL_STRIDE
1088       void *output_args[] = {&input_ptr,
1089                              &target_tile.buffer,
1090                              &output_offset.x,
1091                              &output_offset.y,
1092                              &rect_size.x,
1093                              &rect_size.y,
1094                              &target_tile.x,
1095                              &target_tile.y,
1096                              &target_tile.w,
1097                              &target_tile.h,
1098                              &target_tile.offset,
1099                              &target_tile.stride,
1100                              &task.pass_stride,
1101                              &rtile.sample};
1102       launch_filter_kernel(
1103           "kernel_cuda_filter_convert_from_rgb", target_tile.w, target_tile.h, output_args);
1104 #  endif
1105
1106       check_result_cuda_ret(cuStreamSynchronize(0));
1107
1108       task.unmap_neighbor_tiles(neighbors, this);
1109     }
1110     else {
1111       // Run CUDA denoising kernels
1112       DenoisingTask denoising(this, task);
1113       CUDADevice::denoise(rtile, denoising);
1114     }
1115
1116     // Update task progress after the denoiser completed processing
1117     task.update_progress(&rtile, rtile.w * rtile.h);
1118
1119     return true;
1120   }
1121
1122   void launch_shader_eval(DeviceTask &task, int thread_index)
1123   {
1124     unsigned int rgen_index = PG_BACK;
1125     if (task.shader_eval_type >= SHADER_EVAL_BAKE)
1126       rgen_index = PG_BAKE;
1127     if (task.shader_eval_type == SHADER_EVAL_DISPLACE)
1128       rgen_index = PG_DISP;
1129
1130     const CUDAContextScope scope(cuContext);
1131
1132     device_ptr launch_params_ptr = launch_params.device_pointer +
1133                                    thread_index * launch_params.data_elements;
1134
1135     for (int sample = 0; sample < task.num_samples; ++sample) {
1136       ShaderParams params;
1137       params.input = (uint4 *)task.shader_input;
1138       params.output = (float4 *)task.shader_output;
1139       params.type = task.shader_eval_type;
1140       params.filter = task.shader_filter;
1141       params.sx = task.shader_x;
1142       params.offset = task.offset;
1143       params.sample = sample;
1144
1145       check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, shader),
1146                                           &params,
1147                                           sizeof(params),
1148                                           cuda_stream[thread_index]));
1149
1150       OptixShaderBindingTable sbt_params = {};
1151       sbt_params.raygenRecord = sbt_data.device_pointer + rgen_index * sizeof(SbtRecord);
1152       sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord);
1153       sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
1154       sbt_params.missRecordCount = 1;
1155       sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
1156       sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
1157 #  if OPTIX_ABI_VERSION >= 36
1158       sbt_params.hitgroupRecordCount = 5;  // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL
1159 #  else
1160       sbt_params.hitgroupRecordCount = 3;  // PG_HITD, PG_HITS, PG_HITL
1161 #  endif
1162       sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord);
1163       sbt_params.callablesRecordCount = 3;
1164       sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
1165
1166       check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL],
1167                                      cuda_stream[thread_index],
1168                                      launch_params_ptr,
1169                                      launch_params.data_elements,
1170                                      &sbt_params,
1171                                      task.shader_w,
1172                                      1,
1173                                      1));
1174
1175       check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
1176
1177       task.update_progress(NULL);
1178     }
1179   }
1180
1181   bool build_optix_bvh(BVHOptiX *bvh,
1182                        OptixBuildOperation operation,
1183                        const OptixBuildInput &build_input,
1184                        uint16_t num_motion_steps)
1185   {
1186     /* Allocate and build acceleration structures only one at a time, to prevent parallel builds
1187      * from running out of memory (since both original and compacted acceleration structure memory
1188      * may be allocated at the same time for the duration of this function). The builds would
1189      * otherwise happen on the same CUDA stream anyway. */
1190     static thread_mutex mutex;
1191     thread_scoped_lock lock(mutex);
1192
1193     const CUDAContextScope scope(cuContext);
1194
1195     // Compute memory usage
1196     OptixAccelBufferSizes sizes = {};
1197     OptixAccelBuildOptions options = {};
1198     options.operation = operation;
1199     if (background) {
1200       // Prefer best performance and lowest memory consumption in background
1201       options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
1202     }
1203     else {
1204       // Prefer fast updates in viewport
1205       options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE;
1206     }
1207
1208     options.motionOptions.numKeys = num_motion_steps;
1209     options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH;
1210     options.motionOptions.timeBegin = 0.0f;
1211     options.motionOptions.timeEnd = 1.0f;
1212
1213     check_result_optix_ret(
1214         optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));
1215
1216     // Allocate required output buffers
1217     device_only_memory<char> temp_mem(this, "optix temp as build mem", true);
1218     temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
1219     if (!temp_mem.device_pointer)
1220       return false;  // Make sure temporary memory allocation succeeded
1221
1222     // Acceleration structure memory has to be allocated on the device (not allowed to be on host)
1223     device_only_memory<char> &out_data = bvh->as_data;
1224     if (operation == OPTIX_BUILD_OPERATION_BUILD) {
1225       assert(out_data.device == this);
1226       out_data.alloc_to_device(sizes.outputSizeInBytes);
1227       if (!out_data.device_pointer)
1228         return false;
1229     }
1230     else {
1231       assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes);
1232     }
1233
1234     // Finally build the acceleration structure
1235     OptixAccelEmitDesc compacted_size_prop = {};
1236     compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
1237     // A tiny space was allocated for this property at the end of the temporary buffer above
1238     // Make sure this pointer is 8-byte aligned
1239     compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8);
1240
1241     OptixTraversableHandle out_handle = 0;
1242     check_result_optix_ret(optixAccelBuild(context,
1243                                            NULL,
1244                                            &options,
1245                                            &build_input,
1246                                            1,
1247                                            temp_mem.device_pointer,
1248                                            sizes.tempSizeInBytes,
1249                                            out_data.device_pointer,
1250                                            sizes.outputSizeInBytes,
1251                                            &out_handle,
1252                                            background ? &compacted_size_prop : NULL,
1253                                            background ? 1 : 0));
1254     bvh->traversable_handle = static_cast<uint64_t>(out_handle);
1255
1256     // Wait for all operations to finish
1257     check_result_cuda_ret(cuStreamSynchronize(NULL));
1258
1259     // Compact the acceleration structure to save memory (skipped in the viewport to keep builds fast)
1260     if (background) {
1261       uint64_t compacted_size = sizes.outputSizeInBytes;
1262       check_result_cuda_ret(
1263           cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size)));
1264
1265       // Temporary memory is no longer needed, so free it now to make space
1266       temp_mem.free();
1267
1268       // There is no point compacting if the size does not decrease
1269       if (compacted_size < sizes.outputSizeInBytes) {
1270         device_only_memory<char> compacted_data(this, "optix compacted as", false);
1271         compacted_data.alloc_to_device(compacted_size);
1272         if (!compacted_data.device_pointer)
1273           // Do not compact if memory allocation for compacted acceleration structure fails
1274           // Can just use the uncompacted one then, so succeed here regardless
1275           return true;
1276
1277         check_result_optix_ret(optixAccelCompact(context,
1278                                                  NULL,
1279                                                  out_handle,
1280                                                  compacted_data.device_pointer,
1281                                                  compacted_size,
1282                                                  &out_handle));
1283         bvh->traversable_handle = static_cast<uint64_t>(out_handle);
1284
1285         // Wait for compaction to finish
1286         check_result_cuda_ret(cuStreamSynchronize(NULL));
1287
1288         std::swap(out_data.device_size, compacted_data.device_size);
1289         std::swap(out_data.device_pointer, compacted_data.device_pointer);
1290         // Original acceleration structure memory is freed when 'compacted_data' goes out of scope
1291       }
1292     }
1293
1294     return true;
1295   }
1296
1297   void build_bvh(BVH *bvh, Progress &progress, bool refit) override
1298   {
1299     if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2) {
1300       /* Baking uses the CUDA device, so build the BVH appropriate for it. */
1301       Device::build_bvh(bvh, progress, refit);
1302       return;
1303     }
1304
1305     free_bvh_memory_delayed();
1306
1307     BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
1308
1309     progress.set_substatus("Building OptiX acceleration structure");
1310
1311     if (!bvh->params.top_level) {
1312       assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1);
1313
1314       // Refit is only possible in the viewport for now (because the AS is built with
1315       // OPTIX_BUILD_FLAG_ALLOW_UPDATE only there, see above)
1316       OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD;
1317       if (refit && !background) {
1318         assert(bvh_optix->traversable_handle != 0);
1319         operation = OPTIX_BUILD_OPERATION_UPDATE;
1320       }
1321       else {
1322         bvh_optix->as_data.free();
1323         bvh_optix->traversable_handle = 0;
1324       }
1325
1326       // Build bottom level acceleration structures (BLAS)
1327       Geometry *const geom = bvh->geometry[0];
1328       if (geom->geometry_type == Geometry::HAIR) {
1329         // Build BLAS for curve primitives
1330         Hair *const hair = static_cast<Hair *const>(geom);
1331         if (hair->num_curves() == 0) {
1332           return;
1333         }
1334
1335         const size_t num_segments = hair->num_segments();
1336
1337         size_t num_motion_steps = 1;
1338         Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
1339         if (motion_blur && hair->get_use_motion_blur() && motion_keys) {
1340           num_motion_steps = hair->get_motion_steps();
1341         }
1342
1343         device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY);
1344 #  if OPTIX_ABI_VERSION >= 36
1345         device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
1346         device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
1347         // Four control points for each curve segment
1348         const size_t num_vertices = num_segments * 4;
1349         if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
1350           index_data.alloc(num_segments);
1351           vertex_data.alloc(num_vertices * num_motion_steps);
1352         }
1353         else
1354 #  endif
1355           aabb_data.alloc(num_segments * num_motion_steps);
1356
1357         // Get AABBs for each motion step
1358         for (size_t step = 0; step < num_motion_steps; ++step) {
1359           // The center step for motion vertices is not stored in the attribute
1360           const float3 *keys = hair->get_curve_keys().data();
1361           size_t center_step = (num_motion_steps - 1) / 2;
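          // e.g. with 3 motion steps the attribute stores only steps 0 and 2, while the
          // center step 1 uses the regular curve keys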
1362           if (step != center_step) {
1363             size_t attr_offset = (step > center_step) ? step - 1 : step;
1364             // Technically this is a float4 array, but sizeof(float3) == sizeof(float4)
1365             keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size();
1366           }
1367
1368           for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) {
1369             const Hair::Curve curve = hair->get_curve(j);
1370 #  if OPTIX_ABI_VERSION >= 36
1371             const array<float> &curve_radius = hair->get_curve_radius();
1372 #  endif
1373
1374             for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) {
1375 #  if OPTIX_ABI_VERSION >= 36
1376               if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
1377                 int k0 = curve.first_key + segment;
1378                 int k1 = k0 + 1;
1379                 int ka = max(k0 - 1, curve.first_key);
1380                 int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1);
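                // Neighbor key indices are clamped at the curve ends, so boundary segments
                // simply reuse the first/last key of the curve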
1381
1382                 const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x);
1383                 const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y);
1384                 const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z);
1385                 const float4 pw = make_float4(
1386                     curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]);
1387
1388                 // Convert Catmull-Rom data to B-spline control points
1389                 static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f;
1390                 static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f;
1391                 static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f;
1392                 static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f;
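                // These weights are (inverse B-spline basis) * (Catmull-Rom basis), so the
                // resulting control points describe the same cubic segment in the B-spline
                // basis expected by OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE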
1393
1394                 index_data[i] = i * 4;
1395                 float4 *const v = vertex_data.data() + step * num_vertices + index_data[i];
1396                 v[0] = make_float4(
1397                     dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw));
1398                 v[1] = make_float4(
1399                     dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw));
1400                 v[2] = make_float4(
1401                     dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw));
1402                 v[3] = make_float4(
1403                     dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw));
1404               }
1405               else
1406 #  endif
1407               {
1408                 BoundBox bounds = BoundBox::empty;
1409                 curve.bounds_grow(segment, keys, hair->get_curve_radius().data(), bounds);
1410
1411                 const size_t index = step * num_segments + i;
1412                 aabb_data[index].minX = bounds.min.x;
1413                 aabb_data[index].minY = bounds.min.y;
1414                 aabb_data[index].minZ = bounds.min.z;
1415                 aabb_data[index].maxX = bounds.max.x;
1416                 aabb_data[index].maxY = bounds.max.y;
1417                 aabb_data[index].maxZ = bounds.max.z;
1418               }
1419             }
1420           }
1421         }
1422
1423         // Upload AABB data to GPU
1424         aabb_data.copy_to_device();
1425 #  if OPTIX_ABI_VERSION >= 36
1426         index_data.copy_to_device();
1427         vertex_data.copy_to_device();
1428 #  endif
1429
1430         vector<device_ptr> aabb_ptrs;
1431         aabb_ptrs.reserve(num_motion_steps);
1432 #  if OPTIX_ABI_VERSION >= 36
1433         vector<device_ptr> width_ptrs;
1434         vector<device_ptr> vertex_ptrs;
1435         width_ptrs.reserve(num_motion_steps);
1436         vertex_ptrs.reserve(num_motion_steps);
1437 #  endif
1438         for (size_t step = 0; step < num_motion_steps; ++step) {
1439           aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb));
1440 #  if OPTIX_ABI_VERSION >= 36
1441           const device_ptr base_ptr = vertex_data.device_pointer +
1442                                       step * num_vertices * sizeof(float4);
1443           width_ptrs.push_back(base_ptr + 3 * sizeof(float));  // Radius is the w component of each vertex
1444           vertex_ptrs.push_back(base_ptr);
1445 #  endif
1446         }
1447
1448         // Force a single any-hit call, so shadow record-all behavior works correctly
1449         unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
1450         OptixBuildInput build_input = {};
1451 #  if OPTIX_ABI_VERSION >= 36
1452         if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
1453           build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES;
1454           build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
1455           build_input.curveArray.numPrimitives = num_segments;
1456           build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
1457           build_input.curveArray.numVertices = num_vertices;
1458           build_input.curveArray.vertexStrideInBytes = sizeof(float4);
1459           build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data();
1460           build_input.curveArray.widthStrideInBytes = sizeof(float4);
1461           build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer;
1462           build_input.curveArray.indexStrideInBytes = sizeof(int);
1463           build_input.curveArray.flag = build_flags;
1464           build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset;
1465         }
1466         else
1467 #  endif
1468         {
1469           // Disable the visibility-test any-hit program, since visibility is already checked
1470           // during intersection. Trace calls that require any-hit can force it with a ray flag.
1471           build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
1472
1473           build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
1474 #  if OPTIX_ABI_VERSION < 23
1475           build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
1476           build_input.aabbArray.numPrimitives = num_segments;
1477           build_input.aabbArray.strideInBytes = sizeof(OptixAabb);
1478           build_input.aabbArray.flags = &build_flags;
1479           build_input.aabbArray.numSbtRecords = 1;
1480           build_input.aabbArray.primitiveIndexOffset = hair->optix_prim_offset;
1481 #  else
1482           build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
1483           build_input.customPrimitiveArray.numPrimitives = num_segments;
1484           build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb);
1485           build_input.customPrimitiveArray.flags = &build_flags;
1486           build_input.customPrimitiveArray.numSbtRecords = 1;
1487           build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset;
1488 #  endif
1489         }
1490
1491         if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
1492           progress.set_error("Failed to build OptiX acceleration structure");
1493         }
1494       }
1495       else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) {
1496         // Build BLAS for triangle primitives
1497         Mesh *const mesh = static_cast<Mesh *const>(geom);
1498         if (mesh->num_triangles() == 0) {
1499           return;
1500         }
1501
1502         const size_t num_verts = mesh->get_verts().size();
1503
1504         size_t num_motion_steps = 1;
1505         Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
1506         if (motion_blur && mesh->get_use_motion_blur() && motion_keys) {
1507           num_motion_steps = mesh->get_motion_steps();
1508         }
1509
1510         device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
1511         index_data.alloc(mesh->get_triangles().size());
1512         memcpy(index_data.data(),
1513                mesh->get_triangles().data(),
1514                mesh->get_triangles().size() * sizeof(int));
1515         device_vector<float3> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
1516         vertex_data.alloc(num_verts * num_motion_steps);
1517
1518         for (size_t step = 0; step < num_motion_steps; ++step) {
1519           const float3 *verts = mesh->get_verts().data();
1520
1521           size_t center_step = (num_motion_steps - 1) / 2;
1522           // The center step for motion vertices is not stored in the attribute
1523           if (step != center_step) {
1524             verts = motion_keys->data_float3() +
1525                     (step > center_step ? step - 1 : step) * num_verts;
1526           }
1527
1528           memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3));
1529         }
1530
1531         // Upload triangle data to GPU
1532         index_data.copy_to_device();
1533         vertex_data.copy_to_device();
1534
1535         vector<device_ptr> vertex_ptrs;
1536         vertex_ptrs.reserve(num_motion_steps);
1537         for (size_t step = 0; step < num_motion_steps; ++step) {
1538           vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3));
1539         }
1540
1541         // Force a single any-hit call, so shadow record-all behavior works correctly
1542         unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
1543         OptixBuildInput build_input = {};
1544         build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
1545         build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
1546         build_input.triangleArray.numVertices = num_verts;
1547         build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
1548         build_input.triangleArray.vertexStrideInBytes = sizeof(float3);
1549         build_input.triangleArray.indexBuffer = index_data.device_pointer;
1550         build_input.triangleArray.numIndexTriplets = mesh->num_triangles();
1551         build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3;
1552         build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int);
1553         build_input.triangleArray.flags = &build_flags;
1554         // The SBT does not store per-primitive data, since Cycles already allocates separate
1555         // buffers for that purpose. OptiX does not allow zero SBT records though, so pass in a
1556         // single one and rely on it being used for all primitives.
1557         build_input.triangleArray.numSbtRecords = 1;
1558         build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset;
1559
1560         if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
1561           progress.set_error("Failed to build OptiX acceleration structure");
1562         }
1563       }
1564     }
1565     else {
1566       unsigned int num_instances = 0;
1567       unsigned int max_num_instances = 0xFFFFFFFF;
1568
1569       bvh_optix->as_data.free();
1570       bvh_optix->traversable_handle = 0;
1571       bvh_optix->motion_transform_data.free();
1572
1573       optixDeviceContextGetProperty(context,
1574                                     OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID,
1575                                     &max_num_instances,
1576                                     sizeof(max_num_instances));
1577       // Do not count the first bit, which is used to distinguish instanced from non-instanced objects
1578       max_num_instances >>= 1;
1579       if (bvh->objects.size() > max_num_instances) {
1580         progress.set_error(
1581             "Failed to build OptiX acceleration structure because there are too many instances");
1582         return;
1583       }
1584
1585       // Fill instance descriptions
1586 #  if OPTIX_ABI_VERSION < 41
1587       device_vector<OptixAabb> aabbs(this, "optix tlas aabbs", MEM_READ_ONLY);
1588       aabbs.alloc(bvh->objects.size());
1589 #  endif
1590       device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY);
1591       instances.alloc(bvh->objects.size());
1592
1593       // Calculate the total motion transform size and allocate memory for the transforms
1594       size_t motion_transform_offset = 0;
1595       if (motion_blur) {
1596         size_t total_motion_transform_size = 0;
1597         for (Object *const ob : bvh->objects) {
1598           if (ob->is_traceable() && ob->use_motion()) {
1599             total_motion_transform_size = align_up(total_motion_transform_size,
1600                                                    OPTIX_TRANSFORM_BYTE_ALIGNMENT);
1601             const size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
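            // OptixSRTMotionTransform has storage for two keys built in, so only keys
            // beyond the first two need additional OptixSRTData space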
1602             total_motion_transform_size = total_motion_transform_size +
1603                                           sizeof(OptixSRTMotionTransform) +
1604                                           motion_keys * sizeof(OptixSRTData);
1605           }
1606         }
1607
1608         assert(bvh_optix->motion_transform_data.device == this);
1609         bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size);
1610       }
1611
1612       for (Object *ob : bvh->objects) {
1613         // Skip non-traceable objects
1614         if (!ob->is_traceable())
1615           continue;
1616
1617         BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh);
1618         OptixTraversableHandle handle = blas->traversable_handle;
1619
1620 #  if OPTIX_ABI_VERSION < 41
1621         OptixAabb &aabb = aabbs[num_instances];
1622         aabb.minX = ob->bounds.min.x;
1623         aabb.minY = ob->bounds.min.y;
1624         aabb.minZ = ob->bounds.min.z;
1625         aabb.maxX = ob->bounds.max.x;
1626         aabb.maxY = ob->bounds.max.y;
1627         aabb.maxZ = ob->bounds.max.z;
1628 #  endif
1629
1630         OptixInstance &instance = instances[num_instances++];
1631         memset(&instance, 0, sizeof(instance));
1632
1633         // Clear transform to identity matrix
1634         instance.transform[0] = 1.0f;
1635         instance.transform[5] = 1.0f;
1636         instance.transform[10] = 1.0f;
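        // 'transform' is a row-major 3x4 object-to-world matrix, so elements 0, 5 and 10
        // form the diagonal of the identity transform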
1637
1638         // Set user instance ID to object index (but leave low bit blank)
1639         instance.instanceId = ob->get_device_index() << 1;
1640
1641         // The mask must have at least one bit set, or else the instance is always culled
1642         instance.visibilityMask = 1;
1643
1644         if (ob->get_geometry()->has_volume) {
1645           // Volumes have a special bit set in the visibility mask so a trace can mask only volumes
1646           instance.visibilityMask |= 2;
1647         }
1648
1649         if (ob->get_geometry()->geometry_type == Geometry::HAIR) {
1650           // Same applies to curves (so they can be skipped in local trace calls)
1651           instance.visibilityMask |= 4;
1652
1653 #  if OPTIX_ABI_VERSION >= 36
1654           if (motion_blur && ob->get_geometry()->has_motion_blur() &&
1655               DebugFlags().optix.curves_api &&
1656               static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
1657             // Select between motion blur and non-motion blur built-in intersection module
1658             instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
1659           }
1660 #  endif
1661         }
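        // Together this yields mask bits (0: any instance, 1: volume, 2: curve) that trace
        // calls can use to include or exclude whole primitive classes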
1662
1663         // Insert motion traversable if object has motion
1664         if (motion_blur && ob->use_motion()) {
1665           size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
1666           size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
1667                                          motion_keys * sizeof(OptixSRTData);
1668
1669           const CUDAContextScope scope(cuContext);
1670
1671           motion_transform_offset = align_up(motion_transform_offset,
1672                                              OPTIX_TRANSFORM_BYTE_ALIGNMENT);
1673           CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer +
1674                                              motion_transform_offset;
1675           motion_transform_offset += motion_transform_size;
1676
1677           // Allocate host-side memory for the motion transform and fill it with transform data
1678           OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
1679               new uint8_t[motion_transform_size]);
1680           motion_transform.child = handle;
1681           motion_transform.motionOptions.numKeys = ob->get_motion().size();
1682           motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
1683           motion_transform.motionOptions.timeBegin = 0.0f;
1684           motion_transform.motionOptions.timeEnd = 1.0f;
1685
1686           OptixSRTData *const srt_data = motion_transform.srtData;
1687           array<DecomposedTransform> decomp(ob->get_motion().size());
1688           transform_motion_decompose(
1689               decomp.data(), ob->get_motion().data(), ob->get_motion().size());
1690
1691           for (size_t i = 0; i < ob->get_motion().size(); ++i) {
1692             // Scale
1693             srt_data[i].sx = decomp[i].y.w;  // scale.x.x
1694             srt_data[i].sy = decomp[i].z.w;  // scale.y.y
1695             srt_data[i].sz = decomp[i].w.w;  // scale.z.z
1696
1697             // Shear
1698             srt_data[i].a = decomp[i].z.x;  // scale.x.y
1699             srt_data[i].b = decomp[i].z.y;  // scale.x.z
1700             srt_data[i].c = decomp[i].w.x;  // scale.y.z
1701             assert(decomp[i].z.z == 0.0f);  // scale.y.x
1702             assert(decomp[i].w.y == 0.0f);  // scale.z.x
1703             assert(decomp[i].w.z == 0.0f);  // scale.z.y
1704
1705             // Pivot point
1706             srt_data[i].pvx = 0.0f;
1707             srt_data[i].pvy = 0.0f;
1708             srt_data[i].pvz = 0.0f;
1709
1710             // Rotation
1711             srt_data[i].qx = decomp[i].x.x;
1712             srt_data[i].qy = decomp[i].x.y;
1713             srt_data[i].qz = decomp[i].x.z;
1714             srt_data[i].qw = decomp[i].x.w;
1715
1716             // Translation
1717             srt_data[i].tx = decomp[i].y.x;
1718             srt_data[i].ty = decomp[i].y.y;
1719             srt_data[i].tz = decomp[i].y.z;
1720           }
1721
1722           // Upload motion transform to GPU
1723           cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
1724           delete[] reinterpret_cast<uint8_t *>(&motion_transform);
1725
1726           // Disable instance transform if object uses motion transform already
1727           instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
1728
1729           // Get traversable handle to motion transform
1730           optixConvertPointerToTraversableHandle(context,
1731                                                  motion_transform_gpu,
1732                                                  OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
1733                                                  &instance.traversableHandle);
1734         }
1735         else {
1736           instance.traversableHandle = handle;
1737
1738           if (ob->get_geometry()->is_instanced()) {
1739             // Set transform matrix
1740             memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform));
1741           }
1742           else {
1743             // Disable instance transform if geometry already has it applied to vertex data
1744             instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
1745             // Non-instanced objects read ID from 'prim_object', so distinguish
1746             // them from instanced objects with the low bit set
1747             instance.instanceId |= 1;
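            // The kernel can then recover the object index as 'instanceId >> 1' and use the
            // low bit to choose between instance ID and 'prim_object' lookups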
1748           }
1749         }
1750       }
1751
1752       // Upload instance descriptions
1753 #  if OPTIX_ABI_VERSION < 41
1754       aabbs.resize(num_instances);
1755       aabbs.copy_to_device();
1756 #  endif
1757       instances.resize(num_instances);
1758       instances.copy_to_device();
1759
1760       // Build top-level acceleration structure (TLAS)
1761       OptixBuildInput build_input = {};
1762       build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
1763 #  if OPTIX_ABI_VERSION < 41  // Instance AABBs no longer need to be set since OptiX 7.2
1764       build_input.instanceArray.aabbs = aabbs.device_pointer;
1765       build_input.instanceArray.numAabbs = num_instances;
1766 #  endif
1767       build_input.instanceArray.instances = instances.device_pointer;
1768       build_input.instanceArray.numInstances = num_instances;
1769
1770       if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) {
1771         progress.set_error("Failed to build OptiX acceleration structure");
1772       }
1773       tlas_handle = bvh_optix->traversable_handle;
1774     }
1775   }
1776
1777   void release_optix_bvh(BVH *bvh) override
1778   {
1779     thread_scoped_lock lock(delayed_free_bvh_mutex);
1780     /* Free BVH memory with a delay, since the geometry holding the BVH might be deleted
1781      * while the GPU is still rendering. */
1782     BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
1783
1784     delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->as_data));
1785     delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->motion_transform_data));
1786     bvh_optix->traversable_handle = 0;
1787   }
1788
1789   void free_bvh_memory_delayed()
1790   {
1791     thread_scoped_lock lock(delayed_free_bvh_mutex);
1792     delayed_free_bvh_memory.free_memory();
1793   }
1794
1795   void const_copy_to(const char *name, void *host, size_t size) override
1796   {
1797     // Set constant memory for CUDA module
1798     // TODO(pmours): This is only used for tonemapping (see 'film_convert').
1799     //               Could be removed by moving those functions to the filter CUDA module.
1800     CUDADevice::const_copy_to(name, host, size);
1801
1802     if (strcmp(name, "__data") == 0) {
1803       assert(size <= sizeof(KernelData));
1804
1805       // Update the traversable handle (since it differs per device in multi-device setups)
1806       KernelData *const data = (KernelData *)host;
1807       *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
1808
1809       update_launch_params(offsetof(KernelParams, data), host, size);
1810       return;
1811     }
1812
1813     // Update data storage pointers in launch parameters
1814 #  define KERNEL_TEX(data_type, tex_name) \
1815     if (strcmp(name, #tex_name) == 0) { \
1816       update_launch_params(offsetof(KernelParams, tex_name), host, size); \
1817       return; \
1818     }
1819 #  include "kernel/kernel_textures.h"
1820 #  undef KERNEL_TEX
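    // An entry like 'KERNEL_TEX(float4, __tri_verts)' in kernel_textures.h thus expands to
    // a strcmp() against "__tri_verts" that copies 'host' into the matching KernelParams
    // member of every launch parameter buffer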
1821   }
1822
1823   void update_launch_params(size_t offset, void *data, size_t data_size)
1824   {
1825     const CUDAContextScope scope(cuContext);
1826
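    // 'launch_params' holds a separate KernelParams copy per worker thread, so the update
    // has to be replicated into each thread's slot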
1827     for (int i = 0; i < info.cpu_threads; ++i)
1828       check_result_cuda(
1829           cuMemcpyHtoD(launch_params.device_pointer + i * launch_params.data_elements + offset,
1830                        data,
1831                        data_size));
1832   }
1833
1834   void task_add(DeviceTask &task) override
1835   {
1836     // Upload texture information to device if it has changed since last launch
1837     load_texture_info();
1838
1839     if (task.type == DeviceTask::FILM_CONVERT) {
1840       // Execute in main thread because of OpenGL access
1841       film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
1842       return;
1843     }
1844
1845     if (task.type == DeviceTask::DENOISE_BUFFER) {
1846       // Execute denoising in a single thread (e.g. to avoid race conditions during creation)
1847       task_pool.push([=] {
1848         DeviceTask task_copy = task;
1849         thread_run(task_copy, 0);
1850       });
1851       return;
1852     }
1853
1854     // Split task into smaller ones
1855     list<DeviceTask> tasks;
1856     task.split(tasks, info.cpu_threads);
1857
1858     // Queue tasks in internal task pool
1859     int task_index = 0;
1860     for (DeviceTask &task : tasks) {
1861       task_pool.push([=] {
1862         // Use the task index instead of the thread index, since the number of CUDA streams
1863         // may differ from the number of threads
1864         DeviceTask task_copy = task;
1865         thread_run(task_copy, task_index);
1866       });
1867       task_index++;
1868     }
1869   }
1870
1871   void task_wait() override
1872   {
1873     // Wait for all queued tasks to finish
1874     task_pool.wait_work();
1875   }
1876
1877   void task_cancel() override
1878   {
1879     // Cancel any remaining tasks in the internal pool
1880     task_pool.cancel();
1881   }
1882 };
1883
1884 bool device_optix_init()
1885 {
1886   if (g_optixFunctionTable.optixDeviceContextCreate != NULL)
1887     return true;  // Already initialized function table
1888
1889   // Need to initialize CUDA as well
1890   if (!device_cuda_init())
1891     return false;
1892
1893   const OptixResult result = optixInit();
1894
1895   if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) {
1896     VLOG(1) << "OptiX initialization failed because the installed NVIDIA driver is too old. "
1897                "Please update to the latest driver first!";
1898     return false;
1899   }
1900   else if (result != OPTIX_SUCCESS) {
1901     VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result;
1902     return false;
1903   }
1904
1905   // Loaded OptiX successfully!
1906   return true;
1907 }
1908
1909 void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices)
1910 {
1911   devices.reserve(cuda_devices.size());
1912
1913   // Simply add all supported CUDA devices as OptiX devices again
1914   for (DeviceInfo info : cuda_devices) {
1915     assert(info.type == DEVICE_CUDA);
1916
1917     int major;
1918     cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num);
1919     if (major < 5) {
1920       continue;  // Only Maxwell and up are supported by OptiX
1921     }
1922
1923     info.type = DEVICE_OPTIX;
1924     info.id += "_OptiX";
1925     info.denoisers |= DENOISER_OPTIX;
1926     info.has_branched_path = false;
1927
1928     devices.push_back(info);
1929   }
1930 }
1931
1932 Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
1933 {
1934   return new OptiXDevice(info, stats, profiler, background);
1935 }
1936
1937 CCL_NAMESPACE_END
1938
1939 #endif