Cleanup: refactor adaptive sampling to more easily change some parameters
author Brecht Van Lommel <brecht@blender.org>
Tue, 7 Apr 2020 17:43:51 +0000 (19:43 +0200)
committer Brecht Van Lommel <brecht@blender.org>
Tue, 7 Apr 2020 18:29:48 +0000 (20:29 +0200)
No functional changes yet; this is work towards making CPU and GPU results
match more closely.

intern/cycles/device/device.cpp
intern/cycles/device/device.h
intern/cycles/device/device_cpu.cpp
intern/cycles/device/device_cuda.cpp
intern/cycles/device/device_network.cpp
intern/cycles/device/device_opencl.cpp
intern/cycles/device/device_task.cpp
intern/cycles/kernel/kernel_passes.h
intern/cycles/kernel/kernel_types.h
intern/cycles/render/integrator.cpp
intern/cycles/render/session.cpp

index d94d409175b22b39ece60c1f6a0f4eaaaca034ff..dfbf57e8b8847f0566bcae078cd67b240501df1a 100644 (file)
@@ -597,6 +597,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
 
   info.has_half_images = true;
   info.has_volume_decoupled = true;
+  info.has_adaptive_stop_per_sample = true;
   info.has_osl = true;
   info.has_profiling = true;
 
@@ -639,6 +640,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
     /* Accumulate device info. */
     info.has_half_images &= device.has_half_images;
     info.has_volume_decoupled &= device.has_volume_decoupled;
+    info.has_adaptive_stop_per_sample &= device.has_adaptive_stop_per_sample;
     info.has_osl &= device.has_osl;
     info.has_profiling &= device.has_profiling;
   }
index a98ac1717098de32aa33081184269baacfd5080a..c55dfb3a83b03d62840fe635b672f23db512db06 100644 (file)
@@ -75,12 +75,13 @@ class DeviceInfo {
   string description;
   string id; /* used for user preferences, should stay fixed with changing hardware config */
   int num;
-  bool display_device;       /* GPU is used as a display device. */
-  bool has_half_images;      /* Support half-float textures. */
-  bool has_volume_decoupled; /* Decoupled volume shading. */
-  bool has_osl;              /* Support Open Shading Language. */
-  bool use_split_kernel;     /* Use split or mega kernel. */
-  bool has_profiling;        /* Supports runtime collection of profiling info. */
+  bool display_device;               /* GPU is used as a display device. */
+  bool has_half_images;              /* Support half-float textures. */
+  bool has_volume_decoupled;         /* Decoupled volume shading. */
+  bool has_adaptive_stop_per_sample; /* Per-sample adaptive sampling stopping. */
+  bool has_osl;                      /* Support Open Shading Language. */
+  bool use_split_kernel;             /* Use split or mega kernel. */
+  bool has_profiling;                /* Supports runtime collection of profiling info. */
   int cpu_threads;
   vector<DeviceInfo> multi_devices;
   vector<DeviceInfo> denoising_devices;
@@ -94,6 +95,7 @@ class DeviceInfo {
     display_device = false;
     has_half_images = false;
     has_volume_decoupled = false;
+    has_adaptive_stop_per_sample = false;
     has_osl = false;
     use_split_kernel = false;
     has_profiling = false;
index 57e8523e02a9a04a85f54a53abd51e38f0567172..c701c14318f8a41f7569178c812d8c6c637e5806 100644 (file)
@@ -839,7 +839,7 @@ class CPUDevice : public Device {
     return true;
   }
 
-  bool adaptive_sampling_filter(KernelGlobals *kg, RenderTile &tile)
+  bool adaptive_sampling_filter(KernelGlobals *kg, RenderTile &tile, int sample)
   {
     WorkTile wtile;
     wtile.x = tile.x;
@@ -850,11 +850,24 @@ class CPUDevice : public Device {
     wtile.stride = tile.stride;
     wtile.buffer = (float *)tile.buffer;
 
+    /* For CPU we do adaptive stopping per sample so we can stop earlier, but
+     * for combined CPU + GPU rendering we match the GPU and do it per tile
+     * after a given number of sample steps. */
+    if (!kernel_data.integrator.adaptive_stop_per_sample) {
+      for (int y = wtile.y; y < wtile.y + wtile.h; ++y) {
+        for (int x = wtile.x; x < wtile.x + wtile.w; ++x) {
+          const int index = wtile.offset + x + y * wtile.stride;
+          float *buffer = wtile.buffer + index * kernel_data.film.pass_stride;
+          kernel_do_adaptive_stopping(kg, buffer, sample);
+        }
+      }
+    }
+
     bool any = false;
-    for (int y = tile.y; y < tile.y + tile.h; ++y) {
+    for (int y = wtile.y; y < wtile.y + wtile.h; ++y) {
       any |= kernel_do_adaptive_filter_x(kg, y, &wtile);
     }
-    for (int x = tile.x; x < tile.x + tile.w; ++x) {
+    for (int x = wtile.x; x < wtile.x + wtile.w; ++x) {
       any |= kernel_do_adaptive_filter_y(kg, x, &wtile);
     }
     return (!any);
@@ -917,7 +930,7 @@ class CPUDevice : public Device {
       tile.sample = sample + 1;
 
       if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) {
-        const bool stop = adaptive_sampling_filter(kg, tile);
+        const bool stop = adaptive_sampling_filter(kg, tile, sample);
         if (stop) {
           const int num_progress_samples = end_sample - sample;
           tile.sample = end_sample;
@@ -1327,6 +1340,7 @@ void device_cpu_info(vector<DeviceInfo> &devices)
   info.id = "CPU";
   info.num = 0;
   info.has_volume_decoupled = true;
+  info.has_adaptive_stop_per_sample = true;
   info.has_osl = true;
   info.has_half_images = true;
   info.has_profiling = true;
index 9a703b45c0adea19aae2efaffe31fdf9a4b8029d..4a53fcd151da942504857ee4c7549763f75ac001 100644 (file)
@@ -129,6 +129,7 @@ void device_cuda_info(vector<DeviceInfo> &devices)
 
     info.has_half_images = (major >= 3);
     info.has_volume_decoupled = false;
+    info.has_adaptive_stop_per_sample = false;
 
     int pci_location[3] = {0, 0, 0};
     cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num);
index 2742cbf53aaa317c1c991a1d4589c59808978532..0933d51f3210b2bd6b7c475712a90f9134a9d34e 100644 (file)
@@ -311,6 +311,7 @@ void device_network_info(vector<DeviceInfo> &devices)
 
   /* todo: get this info from device */
   info.has_volume_decoupled = false;
+  info.has_adaptive_stop_per_sample = false;
   info.has_osl = false;
 
   devices.push_back(info);
index 891b73351a050309db8ec0198d6b7dca335f1325..8a0b128697f3bbba5a4acb91ecfcbe5d486215d6 100644 (file)
@@ -119,6 +119,7 @@ void device_opencl_info(vector<DeviceInfo> &devices)
     info.display_device = true;
     info.use_split_kernel = true;
     info.has_volume_decoupled = false;
+    info.has_adaptive_stop_per_sample = false;
     info.id = id;
 
     /* Check OpenCL extensions */
index c36b1344c3bea938bef6bf890c4b09580641f51e..d2447eae867c7a7fb322c19f43346005a320837f 100644 (file)
@@ -138,8 +138,7 @@ void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples)
 
 /* Adaptive Sampling */
 
-AdaptiveSampling::AdaptiveSampling()
-    : use(true), adaptive_step(ADAPTIVE_SAMPLE_STEP), min_samples(0)
+AdaptiveSampling::AdaptiveSampling() : use(true), adaptive_step(0), min_samples(0)
 {
 }
 
index 98136bc7047f9df3a6e9ac25263fd1fd747cd964..7437e540a1f457678e6d5e2145ce30b5b0a63793 100644 (file)
@@ -403,9 +403,13 @@ ccl_device_inline void kernel_write_result(KernelGlobals *kg,
                                make_float4(L_sum.x * 2.0f, L_sum.y * 2.0f, L_sum.z * 2.0f, 0.0f));
     }
 #ifdef __KERNEL_CPU__
-    if (sample > kernel_data.integrator.adaptive_min_samples &&
-        (sample & (ADAPTIVE_SAMPLE_STEP - 1)) == (ADAPTIVE_SAMPLE_STEP - 1)) {
-      kernel_do_adaptive_stopping(kg, buffer, sample);
+    if ((sample > kernel_data.integrator.adaptive_min_samples) &&
+        kernel_data.integrator.adaptive_stop_per_sample) {
+      const int step = kernel_data.integrator.adaptive_step;
+
+      if ((sample & (step - 1)) == (step - 1)) {
+        kernel_do_adaptive_stopping(kg, buffer, sample);
+      }
     }
 #endif
   }
index 44c936da626530fb63a905b4181694a6f8064b96..a1f8c35348dac6a4bc805a0e60cb420d6fc64536 100644 (file)
@@ -63,11 +63,6 @@ CCL_NAMESPACE_BEGIN
 
 #define VOLUME_STACK_SIZE 32
 
-/* Adaptive sampling constants */
-#define ADAPTIVE_SAMPLE_STEP 4
-static_assert((ADAPTIVE_SAMPLE_STEP & (ADAPTIVE_SAMPLE_STEP - 1)) == 0,
-              "ADAPTIVE_SAMPLE_STEP must be power of two for bitwise operations to work");
-
 /* Split kernel constants */
 #define WORK_POOL_SIZE_GPU 64
 #define WORK_POOL_SIZE_CPU 1
@@ -1350,6 +1345,8 @@ typedef struct KernelIntegrator {
   int sampling_pattern;
   int aa_samples;
   int adaptive_min_samples;
+  int adaptive_step;
+  int adaptive_stop_per_sample;
   float adaptive_threshold;
 
   /* volume render */
@@ -1362,7 +1359,7 @@ typedef struct KernelIntegrator {
 
   int max_closures;
 
-  int pad1, pad2, pad3;
+  int pad1;
 } KernelIntegrator;
 static_assert_align(KernelIntegrator, 16);
 
index 2f9d088899edba4a2d388280a185a0697448b23f..d4beb06e57bebeba237ac08a66b0df7f30a69b4a 100644 (file)
@@ -190,6 +190,13 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
   else {
     kintegrator->adaptive_min_samples = max(4, adaptive_min_samples);
   }
+
+  kintegrator->adaptive_step = 4;
+  kintegrator->adaptive_stop_per_sample = device->info.has_adaptive_stop_per_sample;
+
+  /* Adaptive step must be a power of two for bitwise operations to work. */
+  assert((kintegrator->adaptive_step & (kintegrator->adaptive_step - 1)) == 0);
+
   if (aa_samples > 0 && adaptive_threshold == 0.0f) {
     kintegrator->adaptive_threshold = max(0.001f, 1.0f / (float)aa_samples);
     VLOG(1) << "Cycles adaptive sampling: automatic threshold = "
index b1b30979b0e2126355c321d89c13dd334ac3d3bd..58bcc7ccdfbe0530902ac25a1fe7df9d6744501d 100644 (file)
@@ -1110,6 +1110,7 @@ void Session::render(bool with_denoising)
   task.adaptive_sampling.use = (scene->integrator->sampling_pattern == SAMPLING_PATTERN_PMJ) &&
                                scene->dscene.data.film.pass_adaptive_aux_buffer;
   task.adaptive_sampling.min_samples = scene->dscene.data.integrator.adaptive_min_samples;
+  task.adaptive_sampling.adaptive_step = scene->dscene.data.integrator.adaptive_step;
 
   /* Acquire render tiles by default. */
   task.tile_types = RenderTile::PATH_TRACE;