Cycles: Add sample-based runtime profiler that measures time spent in various parts...
authorLukas Stockner <lukas.stockner@freenet.de>
Thu, 29 Nov 2018 01:06:30 +0000 (02:06 +0100)
committerLukas Stockner <lukas.stockner@freenet.de>
Thu, 29 Nov 2018 01:45:24 +0000 (02:45 +0100)
This commit adds a sample-based profiler that runs during CPU rendering and collects statistics on time spent in different parts of the kernel (ray intersection, shader evaluation etc.) as well as time spent per material and object.

The results are currently not exposed in the user interface or per Python yet, to see the stats on the console pass the "--cycles-print-stats" argument to Cycles (e.g. "./blender -- --cycles-print-stats").

Unfortunately, there is no clear way to extend this functionality to CUDA or OpenCL, so it is CPU-only for now.

Reviewers: brecht, sergey, swerner

Reviewed By: brecht, swerner

Differential Revision: https://developer.blender.org/D3892

26 files changed:
intern/cycles/blender/blender_session.cpp
intern/cycles/blender/blender_sync.cpp
intern/cycles/device/device.cpp
intern/cycles/device/device.h
intern/cycles/device/device_cpu.cpp
intern/cycles/device/device_denoising.cpp
intern/cycles/device/device_denoising.h
intern/cycles/kernel/CMakeLists.txt
intern/cycles/kernel/bvh/bvh.h
intern/cycles/kernel/kernel_globals.h
intern/cycles/kernel/kernel_passes.h
intern/cycles/kernel/kernel_path.h
intern/cycles/kernel/kernel_path_subsurface.h
intern/cycles/kernel/kernel_path_surface.h
intern/cycles/kernel/kernel_profiling.h [new file with mode: 0644]
intern/cycles/kernel/kernel_shader.h
intern/cycles/render/object.cpp
intern/cycles/render/object.h
intern/cycles/render/session.cpp
intern/cycles/render/session.h
intern/cycles/render/stats.cpp
intern/cycles/render/stats.h
intern/cycles/util/CMakeLists.txt
intern/cycles/util/util_profiling.cpp [new file with mode: 0644]
intern/cycles/util/util_profiling.h [new file with mode: 0644]
intern/cycles/util/util_stats.h

index 75c7dcee05e9a023b9886e45a69045b70713912d..30ae0bc813dcb24c3f85936da593f56d618b42e2 100644 (file)
@@ -516,7 +516,7 @@ void BlenderSession::render()
 
                if(!b_engine.is_preview() && background && print_render_stats) {
                        RenderStats stats;
-                       session->scene->collect_statistics(&stats);
+                       session->collect_statistics(&stats);
                        printf("Render statistics:\n%s\n", stats.full_report().c_str());
                }
 
index 832847c179fb7e844301d762a73aa999379af7e5..49b046d0a881c43ffbbaa78e29f0584b4978a511 100644 (file)
@@ -874,7 +874,8 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine,
        }
 
        /* tiles */
-       if(params.device.type != DEVICE_CPU && !background) {
+       const bool is_cpu = (params.device.type == DEVICE_CPU);
+       if(!is_cpu && !background) {
                /* currently GPU could be much slower than CPU when using tiles,
                 * still need to be investigated, but meanwhile make it possible
                 * to work in viewport smoothly
@@ -960,6 +961,9 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine,
                params.progressive_update_timeout = 0.1;
        }
 
+       params.use_profiling = params.device.has_profiling && !b_engine.is_preview() &&
+                              background && BlenderSession::print_render_stats;
+
        return params;
 }
 
index 7e20bb449c3be384bff52637aa0812f558a3af15..54ffd4bc4df4d6849cea4e92c5af7f1247421619 100644 (file)
@@ -362,6 +362,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo>& subdevices, int th
        info.has_half_images = true;
        info.has_volume_decoupled = true;
        info.has_osl = true;
+       info.has_profiling = true;
 
        foreach(const DeviceInfo &device, subdevices) {
                /* Ensure CPU device does not slow down GPU. */
@@ -396,6 +397,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo>& subdevices, int th
                info.has_half_images &= device.has_half_images;
                info.has_volume_decoupled &= device.has_volume_decoupled;
                info.has_osl &= device.has_osl;
+               info.has_profiling &= device.has_profiling;
        }
 
        return info;
index f3fb338e63872a262e645f7f72736f06fb4c56eb..071f61a7566474727a5877309e032d4c12718a41 100644 (file)
@@ -60,6 +60,7 @@ public:
        bool has_volume_decoupled;      /* Decoupled volume shading. */
        bool has_osl;                   /* Support Open Shading Language. */
        bool use_split_kernel;          /* Use split or mega kernel. */
+       bool has_profiling;             /* Supports runtime collection of profiling info. */
        int cpu_threads;
        vector<DeviceInfo> multi_devices;
 
@@ -75,6 +76,7 @@ public:
                has_volume_decoupled = false;
                has_osl = false;
                use_split_kernel = false;
+               has_profiling = false;
        }
 
        bool operator==(const DeviceInfo &info) {
index 36fe9bfc92b5f3f70d19cc53860875c99dd99c71..f0a6fd6e3f4d8dcea421c7004106135c44edb0a8 100644 (file)
@@ -477,6 +477,8 @@ public:
        bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
                                       DenoisingTask *task)
        {
+               ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_NON_LOCAL_MEANS);
+
                int4 rect = task->rect;
                int   r   = task->nlm_state.r;
                int   f   = task->nlm_state.f;
@@ -529,6 +531,8 @@ public:
 
        bool denoising_construct_transform(DenoisingTask *task)
        {
+               ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_CONSTRUCT_TRANSFORM);
+
                for(int y = 0; y < task->filter_area.w; y++) {
                        for(int x = 0; x < task->filter_area.z; x++) {
                                filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer,
@@ -551,6 +555,8 @@ public:
                                   device_ptr output_ptr,
                                   DenoisingTask *task)
        {
+               ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT);
+
                mem_zero(task->storage.XtWX);
                mem_zero(task->storage.XtWY);
 
@@ -609,8 +615,10 @@ public:
 
        bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
                                      device_ptr mean_ptr, device_ptr variance_ptr,
-                                     int r, int4 rect, DenoisingTask * /*task*/)
+                                     int r, int4 rect, DenoisingTask *task)
        {
+               ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_COMBINE_HALVES);
+
                for(int y = rect.y; y < rect.w; y++) {
                        for(int x = rect.x; x < rect.z; x++) {
                                filter_combine_halves_kernel()(x, y,
@@ -629,6 +637,8 @@ public:
                                     device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
                                     device_ptr buffer_variance_ptr, DenoisingTask *task)
        {
+               ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DIVIDE_SHADOW);
+
                for(int y = task->rect.y; y < task->rect.w; y++) {
                        for(int x = task->rect.x; x < task->rect.z; x++) {
                                filter_divide_shadow_kernel()(task->render_buffer.samples,
@@ -653,6 +663,8 @@ public:
                                   device_ptr variance_ptr,
                                   DenoisingTask *task)
        {
+               ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_GET_FEATURE);
+
                for(int y = task->rect.y; y < task->rect.w; y++) {
                        for(int x = task->rect.x; x < task->rect.z; x++) {
                                filter_get_feature_kernel()(task->render_buffer.samples,
@@ -676,6 +688,8 @@ public:
                                       device_ptr output_ptr,
                                       DenoisingTask *task)
        {
+               ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DETECT_OUTLIERS);
+
                for(int y = task->rect.y; y < task->rect.w; y++) {
                        for(int x = task->rect.x; x < task->rect.z; x++) {
                                filter_detect_outliers_kernel()(x, y,
@@ -735,6 +749,8 @@ public:
 
        void denoise(DenoisingTask& denoising, RenderTile &tile)
        {
+               ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING);
+
                tile.sample = tile.start_sample + tile.num_samples;
 
                denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising);
@@ -765,6 +781,8 @@ public:
 
                KernelGlobals *kg = new ((void*) kgbuffer.device_pointer) KernelGlobals(thread_kernel_globals_init());
 
+               stats.profiler.add_state(&kg->profiler);
+
                CPUSplitKernel *split_kernel = NULL;
                if(use_split_kernel) {
                        split_kernel = new CPUSplitKernel(this);
@@ -778,6 +796,7 @@ public:
 
                RenderTile tile;
                DenoisingTask denoising(this, task);
+               denoising.profiler = &kg->profiler;
 
                while(task.acquire_tile(this, tile)) {
                        if(tile.task == RenderTile::PATH_TRACE) {
@@ -802,6 +821,8 @@ public:
                        }
                }
 
+               stats.profiler.remove_state(&kg->profiler);
+
                thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
                kg->~KernelGlobals();
                kgbuffer.free();
@@ -1061,6 +1082,7 @@ void device_cpu_info(vector<DeviceInfo>& devices)
        info.has_volume_decoupled = true;
        info.has_osl = true;
        info.has_half_images = true;
+       info.has_profiling = true;
 
        devices.insert(devices.begin(), info);
 }
index 78c65a3d22d5b1e8ffeaf6a93b85a48f6683f1f3..433cbd3c265a3a72e156e70d1f7c36a9f41173e8 100644 (file)
@@ -22,6 +22,7 @@ CCL_NAMESPACE_BEGIN
 
 DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task)
 : tile_info_mem(device, "denoising tile info mem", MEM_READ_WRITE),
+  profiler(NULL),
   storage(device),
   buffer(device),
   device(device)
index 8e0666d0e59cce05648077d00a8ceb11d4c93fd0..beae60c220faa7084848a15ea32c47a236d89855 100644 (file)
@@ -23,6 +23,8 @@
 
 #include "kernel/filter/filter_defines.h"
 
+#include "util/util_profiling.h"
+
 CCL_NAMESPACE_BEGIN
 
 class DenoisingTask {
@@ -51,6 +53,8 @@ public:
        TileInfo *tile_info;
        device_vector<int> tile_info_mem;
 
+       ProfilingState *profiler;
+
        int4 rect;
        int4 filter_area;
 
index 92cb66bdec91ada53940dfc44b7898fc88229bc0..d4145225b77ca6fa175e02226bcef61f15ef7e9c 100644 (file)
@@ -110,6 +110,7 @@ set(SRC_HEADERS
        kernel_path_surface.h
        kernel_path_subsurface.h
        kernel_path_volume.h
+       kernel_profiling.h
        kernel_projection.h
        kernel_queues.h
        kernel_random.h
index 6708a3efac12da6151cd65ec03768e7de19e5632..284b1e9208cf0beb25386436b91b85d7ca18638b 100644 (file)
@@ -186,6 +186,8 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
                                           float difl,
                                           float extmax)
 {
+       PROFILING_INIT(kg, PROFILING_INTERSECT);
+
        if(!scene_intersect_valid(&ray)) {
                return false;
        }
@@ -248,6 +250,8 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg,
                                                 uint *lcg_state,
                                                 int max_hits)
 {
+       PROFILING_INIT(kg, PROFILING_INTERSECT_LOCAL);
+
        if(!scene_intersect_valid(&ray)) {
                return false;
        }
@@ -327,6 +331,8 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
                                                      uint max_hits,
                                                      uint *num_hits)
 {
+       PROFILING_INIT(kg, PROFILING_INTERSECT_SHADOW_ALL);
+
        if(!scene_intersect_valid(ray)) {
                return false;
        }
@@ -407,6 +413,8 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg,
                                                  Intersection *isect,
                                                  const uint visibility)
 {
+       PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME);
+
        if(!scene_intersect_valid(ray)) {
                return false;
        }
@@ -438,6 +446,8 @@ ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals *kg,
                                                      const uint max_hits,
                                                      const uint visibility)
 {
+       PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME_ALL);
+
        if(!scene_intersect_valid(ray)) {
                return false;
        }
index 37402f428634d5b68c817ee1a6f13a4fe769da03..59f1e252d21044e8c3cac14441a44a174b09735c 100644 (file)
@@ -19,6 +19,8 @@
 #ifndef __KERNEL_GLOBALS_H__
 #define __KERNEL_GLOBALS_H__
 
+#include "kernel/kernel_profiling.h"
+
 #ifdef __KERNEL_CPU__
 #  include "util/util_vector.h"
 #  include "util/util_map.h"
@@ -82,6 +84,8 @@ typedef struct KernelGlobals {
 
        int2 global_size;
        int2 global_id;
+
+       ProfilingState profiler;
 } KernelGlobals;
 
 #endif  /* __KERNEL_CPU__ */
index 1f5929e49388d69502d616b499ef930e26965154..08e9db05c39dc41f574a0378c57deca6d2727fde 100644 (file)
@@ -376,6 +376,9 @@ ccl_device_inline void kernel_write_result(KernelGlobals *kg,
                                            int sample,
                                            PathRadiance *L)
 {
+       PROFILING_INIT(kg, PROFILING_WRITE_RESULT);
+       PROFILING_OBJECT(PRIM_NONE);
+
        float alpha;
        float3 L_sum = path_radiance_clamp_and_sum(kg, L, &alpha);
 
index cb1f410b09f031a793ac830bc3cb83ce77bc4aa2..a1fc6028293912e218542f3bba2e636d02d82174 100644 (file)
@@ -57,6 +57,8 @@ ccl_device_forceinline bool kernel_path_scene_intersect(
        Intersection *isect,
        PathRadiance *L)
 {
+       PROFILING_INIT(kg, PROFILING_SCENE_INTERSECT);
+
        uint visibility = path_state_ray_visibility(kg, state);
 
        if(path_state_ao_bounce(kg, state)) {
@@ -105,6 +107,8 @@ ccl_device_forceinline void kernel_path_lamp_emission(
        ShaderData *emission_sd,
        PathRadiance *L)
 {
+       PROFILING_INIT(kg, PROFILING_INDIRECT_EMISSION);
+
 #ifdef __LAMP_MIS__
        if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) {
                /* ray starting from previous non-transparent bounce */
@@ -172,6 +176,8 @@ ccl_device_forceinline VolumeIntegrateResult kernel_path_volume(
        ShaderData *emission_sd,
        PathRadiance *L)
 {
+       PROFILING_INIT(kg, PROFILING_VOLUME);
+
        /* Sanitize volume stack. */
        if(!hit) {
                kernel_volume_clean_stack(kg, state->volume_stack);
@@ -278,6 +284,8 @@ ccl_device_forceinline bool kernel_path_shader_apply(
        PathRadiance *L,
        ccl_global float *buffer)
 {
+       PROFILING_INIT(kg, PROFILING_SHADER_APPLY);
+
 #ifdef __SHADOW_TRICKS__
        if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) {
                if(state->flag & PATH_RAY_TRANSPARENT_BACKGROUND) {
@@ -355,6 +363,8 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
                                         float3 throughput,
                                         float3 ao_alpha)
 {
+       PROFILING_INIT(kg, PROFILING_AO);
+
        /* todo: solve correlation */
        float bsdf_u, bsdf_v;
 
@@ -568,6 +578,8 @@ ccl_device_forceinline void kernel_path_integrate(
        ccl_global float *buffer,
        ShaderData *emission_sd)
 {
+       PROFILING_INIT(kg, PROFILING_PATH_INTEGRATE);
+
        /* Shader data memory used for both volumes and surfaces, saves stack space. */
        ShaderData sd;
 
@@ -719,6 +731,8 @@ ccl_device void kernel_path_trace(KernelGlobals *kg,
        ccl_global float *buffer,
        int sample, int x, int y, int offset, int stride)
 {
+       PROFILING_INIT(kg, PROFILING_RAY_SETUP);
+
        /* buffer offset */
        int index = offset + x + y*stride;
        int pass_stride = kernel_data.film.pass_stride;
index ff4a8a9d5800481f2d4ba754ed53ef3cf8ee3ab3..962776f21c17a60b98472211907ad4960b3fc44d 100644 (file)
@@ -32,6 +32,8 @@ bool kernel_path_subsurface_scatter(
         ccl_addr_space float3 *throughput,
         ccl_addr_space SubsurfaceIndirectRays *ss_indirect)
 {
+       PROFILING_INIT(kg, PROFILING_SUBSURFACE);
+
        float bssrdf_u, bssrdf_v;
        path_state_rng_2D(kg, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
 
index c9202ccb16d1c29f6b53188000fc835f0be0bb87..0d18a1e8c77798c05c1153e838fec15d17e0dd6b 100644 (file)
@@ -217,6 +217,8 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg,
        ShaderData *sd, ShaderData *emission_sd, float3 throughput, ccl_addr_space PathState *state,
        PathRadiance *L)
 {
+       PROFILING_INIT(kg, PROFILING_CONNECT_LIGHT);
+
 #ifdef __EMISSION__
        if(!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)))
                return;
@@ -274,6 +276,8 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
                                            PathRadianceState *L_state,
                                            ccl_addr_space Ray *ray)
 {
+       PROFILING_INIT(kg, PROFILING_SURFACE_BOUNCE);
+
        /* no BSDF? we can stop here */
        if(sd->flag & SD_BSDF) {
                /* sample BSDF */
diff --git a/intern/cycles/kernel/kernel_profiling.h b/intern/cycles/kernel/kernel_profiling.h
new file mode 100644 (file)
index 0000000..a46d637
--- /dev/null
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2011-2018 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_PROFILING_H__
+#define __KERNEL_PROFILING_H__
+
+#ifdef __KERNEL_CPU__
+#  include "util/util_profiling.h"
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __KERNEL_CPU__
+#  define PROFILING_INIT(kg, event) ProfilingHelper profiling_helper(&kg->profiler, event)
+#  define PROFILING_EVENT(event) profiling_helper.set_event(event)
+#  define PROFILING_SHADER(shader) if((shader) != SHADER_NONE) { profiling_helper.set_shader((shader) & SHADER_MASK); }
+#  define PROFILING_OBJECT(object) if((object) != PRIM_NONE) { profiling_helper.set_object(object); }
+#else
+#  define PROFILING_INIT(kg, event)
+#  define PROFILING_EVENT(event)
+#  define PROFILING_SHADER(shader)
+#  define PROFILING_OBJECT(object)
+#endif  /* __KERNEL_CPU__ */
+
+CCL_NAMESPACE_END
+
+#endif  /* __KERNEL_PROFILING_H__ */
index af883aa715bc98f87bb2247308b607bf6babe120..4b2e675bb218481e8222a8b673686c4a924a806c 100644 (file)
@@ -54,6 +54,8 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
                                                const Intersection *isect,
                                                const Ray *ray)
 {
+       PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
+
 #ifdef __INSTANCING__
        sd->object = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object;
 #endif
@@ -147,6 +149,9 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
        differential_incoming(&sd->dI, ray->dD);
        differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng);
 #endif
+
+       PROFILING_SHADER(sd->shader);
+       PROFILING_OBJECT(sd->object);
 }
 
 /* ShaderData setup from BSSRDF scatter */
@@ -163,6 +168,8 @@ void shader_setup_from_subsurface(
         const Intersection *isect,
         const Ray *ray)
 {
+       PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
+
        const bool backfacing = sd->flag & SD_BACKFACING;
 
        /* object, matrices, time, ray_length stay the same */
@@ -233,6 +240,8 @@ void shader_setup_from_subsurface(
        differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng);
        /* don't modify dP and dI */
 #  endif
+
+       PROFILING_SHADER(sd->shader);
 }
 #endif
 
@@ -249,6 +258,8 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
                                                 bool object_space,
                                                 int lamp)
 {
+       PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
+
        /* vectors */
        sd->P = P;
        sd->N = Ng;
@@ -353,6 +364,9 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
        sd->du = differential_zero();
        sd->dv = differential_zero();
 #endif
+
+       PROFILING_SHADER(sd->shader);
+       PROFILING_OBJECT(sd->object);
 }
 
 /* ShaderData setup for displacement */
@@ -380,6 +394,8 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd,
 
 ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderData *sd, const Ray *ray)
 {
+       PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
+
        /* vectors */
        sd->P = ray->D;
        sd->N = -ray->D;
@@ -414,6 +430,9 @@ ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderDat
        sd->du = differential_zero();
        sd->dv = differential_zero();
 #endif
+
+       PROFILING_SHADER(sd->shader);
+       PROFILING_OBJECT(sd->object);
 }
 
 /* ShaderData setup from point inside volume */
@@ -421,6 +440,8 @@ ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderDat
 #ifdef __VOLUME__
 ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *sd, const Ray *ray)
 {
+       PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
+
        /* vectors */
        sd->P = ray->P;
        sd->N = -ray->D;
@@ -461,6 +482,9 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *s
        /* for NDC coordinates */
        sd->ray_P = ray->P;
        sd->ray_dP = ray->dP;
+
+       PROFILING_SHADER(sd->shader);
+       PROFILING_OBJECT(sd->object);
 }
 #endif  /* __VOLUME__ */
 
@@ -591,6 +615,8 @@ void shader_bsdf_eval(KernelGlobals *kg,
                       float light_pdf,
                       bool use_mis)
 {
+       PROFILING_INIT(kg, PROFILING_CLOSURE_EVAL);
+
        bsdf_eval_init(eval, NBUILTIN_CLOSURES, make_float3(0.0f, 0.0f, 0.0f), kernel_data.film.use_light_pass);
 
 #ifdef __BRANCHED_PATH__
@@ -720,6 +746,8 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
                                          differential3 *domega_in,
                                          float *pdf)
 {
+       PROFILING_INIT(kg, PROFILING_CLOSURE_SAMPLE);
+
        const ShaderClosure *sc = shader_bsdf_pick(sd, &randu);
        if(sc == NULL) {
                *pdf = 0.0f;
@@ -751,6 +779,8 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg, ShaderData *sd,
        const ShaderClosure *sc, float randu, float randv, BsdfEval *bsdf_eval,
        float3 *omega_in, differential3 *domega_in, float *pdf)
 {
+       PROFILING_INIT(kg, PROFILING_CLOSURE_SAMPLE);
+
        int label;
        float3 eval;
 
@@ -984,6 +1014,8 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
 ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
        ccl_addr_space PathState *state, int path_flag)
 {
+       PROFILING_INIT(kg, PROFILING_SHADER_EVAL);
+
        /* If path is being terminated, we are tracing a shadow ray or evaluating
         * emission, then we don't need to store closures. The emission and shadow
         * shader data also do not have a closure array to save GPU memory. */
@@ -1084,6 +1116,8 @@ ccl_device_inline void _shader_volume_phase_multi_eval(const ShaderData *sd, con
 ccl_device void shader_volume_phase_eval(KernelGlobals *kg, const ShaderData *sd,
        const float3 omega_in, BsdfEval *eval, float *pdf)
 {
+       PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_EVAL);
+
        bsdf_eval_init(eval, NBUILTIN_CLOSURES, make_float3(0.0f, 0.0f, 0.0f), kernel_data.film.use_light_pass);
 
        _shader_volume_phase_multi_eval(sd, omega_in, pdf, -1, eval, 0.0f, 0.0f);
@@ -1093,6 +1127,8 @@ ccl_device int shader_volume_phase_sample(KernelGlobals *kg, const ShaderData *s
        float randu, float randv, BsdfEval *phase_eval,
        float3 *omega_in, differential3 *domega_in, float *pdf)
 {
+       PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_SAMPLE);
+
        int sampled = 0;
 
        if(sd->num_closure > 1) {
@@ -1151,6 +1187,8 @@ ccl_device int shader_phase_sample_closure(KernelGlobals *kg, const ShaderData *
        const ShaderClosure *sc, float randu, float randv, BsdfEval *phase_eval,
        float3 *omega_in, differential3 *domega_in, float *pdf)
 {
+       PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_SAMPLE);
+
        int label;
        float3 eval;
 
index dc7a1043208f3d41cad95859cd67d1cc21a5b7d7..b34d16c438bf92532cc676f5237343221a272b2c 100644 (file)
@@ -335,6 +335,11 @@ uint Object::visibility_for_tracing() const {
        return trace_visibility;
 }
 
+int Object::get_device_index() const
+{
+       return index;
+}
+
 /* Object Manager */
 
 ObjectManager::ObjectManager()
@@ -348,10 +353,9 @@ ObjectManager::~ObjectManager()
 }
 
 void ObjectManager::device_update_object_transform(UpdateObjectTransformState *state,
-                                                   Object *ob,
-                                                   int object_index)
+                                                   Object *ob)
 {
-       KernelObject& kobject = state->objects[object_index];
+       KernelObject& kobject = state->objects[ob->index];
        Transform *object_motion_pass = state->object_motion_pass;
 
        Mesh *mesh = ob->mesh;
@@ -457,13 +461,13 @@ void ObjectManager::device_update_object_transform(UpdateObjectTransformState *s
                        tfm_post = tfm_post * itfm;
                }
 
-               int motion_pass_offset = object_index*OBJECT_MOTION_PASS_SIZE;
+               int motion_pass_offset = ob->index*OBJECT_MOTION_PASS_SIZE;
                object_motion_pass[motion_pass_offset + 0] = tfm_pre;
                object_motion_pass[motion_pass_offset + 1] = tfm_post;
        }
        else if(state->need_motion == Scene::MOTION_BLUR) {
                if(ob->use_motion()) {
-                       kobject.motion_offset = state->motion_offset[object_index];
+                       kobject.motion_offset = state->motion_offset[ob->index];
 
                        /* Decompose transforms for interpolation. */
                        DecomposedTransform *decomp = state->object_motion + kobject.motion_offset;
@@ -494,7 +498,7 @@ void ObjectManager::device_update_object_transform(UpdateObjectTransformState *s
        if(ob->use_holdout) {
                flag |= SD_OBJECT_HOLDOUT_MASK;
        }
-       state->object_flag[object_index] = flag;
+       state->object_flag[ob->index] = flag;
 
        /* Have curves. */
        if(mesh->num_curves()) {
@@ -538,7 +542,7 @@ void ObjectManager::device_update_object_transform_task(
                for(int i = 0; i < num_objects; ++i) {
                        const int object_index = start_index + i;
                        Object *ob = state->scene->objects[object_index];
-                       device_update_object_transform(state, ob, object_index);
+                       device_update_object_transform(state, ob);
                }
        }
 }
@@ -593,10 +597,8 @@ void ObjectManager::device_update_transforms(DeviceScene *dscene,
         * need some tweaks to make mid-complex scenes optimal.
         */
        if(scene->objects.size() < 64) {
-               int object_index = 0;
                foreach(Object *ob, scene->objects) {
-                       device_update_object_transform(&state, ob, object_index);
-                       object_index++;
+                       device_update_object_transform(&state, ob);
                        if(progress.get_cancel()) {
                                return;
                        }
@@ -642,6 +644,12 @@ void ObjectManager::device_update(Device *device, DeviceScene *dscene, Scene *sc
        if(scene->objects.size() == 0)
                return;
 
+       /* Assign object IDs. */
+       int index = 0;
+       foreach(Object *object, scene->objects) {
+               object->index = index++;
+       }
+
        /* set object transform matrices, before applying static transforms */
        progress.set_status("Updating Objects", "Copying Transformations to device");
        device_update_transforms(dscene, scene, progress);
@@ -686,26 +694,25 @@ void ObjectManager::device_update_flags(Device *,
                }
        }
 
-       int object_index = 0;
        foreach(Object *object, scene->objects) {
                if(object->mesh->has_volume) {
-                       object_flag[object_index] |= SD_OBJECT_HAS_VOLUME;
-                       object_flag[object_index] &= ~SD_OBJECT_HAS_VOLUME_ATTRIBUTES;
+                       object_flag[object->index] |= SD_OBJECT_HAS_VOLUME;
+                       object_flag[object->index] &= ~SD_OBJECT_HAS_VOLUME_ATTRIBUTES;
 
                        foreach(Attribute& attr, object->mesh->attributes.attributes) {
                                if(attr.element == ATTR_ELEMENT_VOXEL) {
-                                       object_flag[object_index] |= SD_OBJECT_HAS_VOLUME_ATTRIBUTES;
+                                       object_flag[object->index] |= SD_OBJECT_HAS_VOLUME_ATTRIBUTES;
                                }
                        }
                }
                else {
-                       object_flag[object_index] &= ~(SD_OBJECT_HAS_VOLUME|SD_OBJECT_HAS_VOLUME_ATTRIBUTES);
+                       object_flag[object->index] &= ~(SD_OBJECT_HAS_VOLUME|SD_OBJECT_HAS_VOLUME_ATTRIBUTES);
                }
                if(object->is_shadow_catcher) {
-                       object_flag[object_index] |= SD_OBJECT_SHADOW_CATCHER;
+                       object_flag[object->index] |= SD_OBJECT_SHADOW_CATCHER;
                }
                else {
-                       object_flag[object_index] &= ~SD_OBJECT_SHADOW_CATCHER;
+                       object_flag[object->index] &= ~SD_OBJECT_SHADOW_CATCHER;
                }
 
                if(bounds_valid) {
@@ -714,7 +721,7 @@ void ObjectManager::device_update_flags(Device *,
                                        continue;
                                }
                                if(object->bounds.intersects(volume_object->bounds)) {
-                                       object_flag[object_index] |= SD_OBJECT_INTERSECTS_VOLUME;
+                                       object_flag[object->index] |= SD_OBJECT_INTERSECTS_VOLUME;
                                        break;
                                }
                        }
@@ -723,9 +730,8 @@ void ObjectManager::device_update_flags(Device *,
                        /* Not really valid, but can't make more reliable in the case
                         * of bounds not being up to date.
                         */
-                       object_flag[object_index] |= SD_OBJECT_INTERSECTS_VOLUME;
+                       object_flag[object->index] |= SD_OBJECT_INTERSECTS_VOLUME;
                }
-               ++object_index;
        }
 
        /* Copy object flag. */
@@ -741,7 +747,6 @@ void ObjectManager::device_update_mesh_offsets(Device *, DeviceScene *dscene, Sc
        KernelObject *kobjects = dscene->objects.data();
 
        bool update = false;
-       int object_index = 0;
 
        foreach(Object *object, scene->objects) {
                Mesh* mesh = object->mesh;
@@ -750,18 +755,16 @@ void ObjectManager::device_update_mesh_offsets(Device *, DeviceScene *dscene, Sc
                        uint patch_map_offset = 2*(mesh->patch_table_offset + mesh->patch_table->total_size() -
                                                   mesh->patch_table->num_nodes * PATCH_NODE_SIZE) - mesh->patch_offset;
 
-                       if(kobjects[object_index].patch_map_offset != patch_map_offset) {
-                               kobjects[object_index].patch_map_offset = patch_map_offset;
+                       if(kobjects[object->index].patch_map_offset != patch_map_offset) {
+                               kobjects[object->index].patch_map_offset = patch_map_offset;
                                update = true;
                        }
                }
 
-               if(kobjects[object_index].attribute_map_offset != mesh->attr_map_offset) {
-                       kobjects[object_index].attribute_map_offset = mesh->attr_map_offset;
+               if(kobjects[object->index].attribute_map_offset != mesh->attr_map_offset) {
+                       kobjects[object->index].attribute_map_offset = mesh->attr_map_offset;
                        update = true;
                }
-
-               object_index++;
        }
 
        if(update) {
index 87e6e6652ad3b37cdef192eb8f1abd4151ea51dd..134f0bc357726677dfce47402304f5038a6c2aab 100644 (file)
@@ -38,6 +38,7 @@ class Progress;
 class Scene;
 struct Transform;
 struct UpdateObjectTransformState;
+class ObjectManager;
 
 /* Object */
 
@@ -88,6 +89,16 @@ public:
         * determined flags which denotes trace-time visibility.
         */
        uint visibility_for_tracing() const;
+
+       /* Returns the index that is used in the kernel for this object. */
+       int get_device_index() const;
+
+protected:
+       /* Specifies the position of the object in scene->objects and
+        * in the device vectors. Gets set in device_update. */
+       int index;
+
+       friend class ObjectManager;
 };
 
 /* Object Manager */
@@ -123,8 +134,7 @@ public:
 
 protected:
        void device_update_object_transform(UpdateObjectTransformState *state,
-                                           Object *ob,
-                                           const int object_index);
+                                           Object *ob);
        void device_update_object_transform_task(UpdateObjectTransformState *state);
        bool device_update_object_transform_pop_work(
                UpdateObjectTransformState *state,
index d6ecafa19b7cfe357a7e3744ba49e14a3e932553..a8374b662d0088e666e670a60fbec78000baeb7a 100644 (file)
@@ -250,7 +250,9 @@ void Session::run_gpu()
                if(!no_tiles) {
                        /* update scene */
                        scoped_timer update_timer;
-                       update_scene();
+                       if(update_scene()) {
+                               stats.profiler.reset(scene->shaders.size(), scene->objects.size());
+                       }
                        progress.add_skip_time(update_timer, params.background);
 
                        if(!device->error_message().empty())
@@ -585,7 +587,9 @@ void Session::run_cpu()
 
                        /* update scene */
                        scoped_timer update_timer;
-                       update_scene();
+                       if(update_scene()) {
+                               stats.profiler.reset(scene->shaders.size(), scene->objects.size());
+                       }
                        progress.add_skip_time(update_timer, params.background);
 
                        if(!device->error_message().empty())
@@ -729,6 +733,10 @@ void Session::run()
        /* load kernels */
        load_kernels();
 
+       if(params.use_profiling && (params.device.type == DEVICE_CPU)) {
+               stats.profiler.start();
+       }
+
        /* session thread loop */
        progress.set_status("Waiting for render to start");
 
@@ -743,6 +751,8 @@ void Session::run()
                        run_cpu();
        }
 
+       stats.profiler.stop();
+
        /* progress update */
        if(progress.get_cancel())
                progress.set_status("Cancel", progress.get_cancel_message());
@@ -825,7 +835,7 @@ void Session::wait()
        session_thread = NULL;
 }
 
-void Session::update_scene()
+bool Session::update_scene()
 {
        thread_scoped_lock scene_lock(scene->mutex);
 
@@ -876,7 +886,10 @@ void Session::update_scene()
 
                progress.set_status("Updating Scene");
                MEM_GUARDED_CALL(&progress, scene->device_update, device, progress);
+
+               return true;
        }
+       return false;
 }
 
 void Session::update_status_time(bool show_pause, bool show_done)
@@ -1052,6 +1065,14 @@ void Session::device_free()
         */
 }
 
+void Session::collect_statistics(RenderStats *render_stats)
+{
+       scene->collect_statistics(render_stats);
+       if(params.use_profiling && (params.device.type == DEVICE_CPU)) {
+               render_stats->collect_profiling(scene, &stats);
+       }
+}
+
 int Session::get_max_closure_count()
 {
        int max_closures = 0;
index 56a69919a7a80be4c382c3f93a7b3993aeb5d30c..db07c75842740ee0952bc94233b2fab4a02dfa08 100644 (file)
@@ -20,6 +20,7 @@
 #include "render/buffers.h"
 #include "device/device.h"
 #include "render/shader.h"
+#include "render/stats.h"
 #include "render/tile.h"
 
 #include "util/util_progress.h"
@@ -55,6 +56,8 @@ public:
        int pixel_size;
        int threads;
 
+       bool use_profiling;
+
        bool display_buffer_linear;
 
        bool use_denoising;
@@ -89,6 +92,8 @@ public:
                pixel_size = 1;
                threads = 0;
 
+               use_profiling = false;
+
                use_denoising = false;
                denoising_passes = false;
                denoising_radius = 8;
@@ -118,6 +123,7 @@ public:
                && start_resolution == params.start_resolution
                && pixel_size == params.pixel_size
                && threads == params.threads
+               && use_profiling == params.use_profiling
                && display_buffer_linear == params.display_buffer_linear
                && cancel_timeout == params.cancel_timeout
                && reset_timeout == params.reset_timeout
@@ -159,7 +165,7 @@ public:
        void set_samples(int samples);
        void set_pause(bool pause);
 
-       void update_scene();
+       bool update_scene();
        void load_kernels(bool lock_scene=true);
 
        void device_free();
@@ -168,6 +174,8 @@ public:
         * (for example, when rendering with unlimited samples). */
        float get_progress();
 
+       void collect_statistics(RenderStats *stats);
+
 protected:
        struct DelayedReset {
                thread_mutex mutex;
index 101d33fcf6530e73ffb4023b671c556e9754126b..71d9353ad3db410098b20358453d893a0ec04fad 100644 (file)
@@ -15,6 +15,7 @@
  */
 
 #include "render/stats.h"
+#include "render/object.h"
 #include "util/util_algorithm.h"
 #include "util/util_foreach.h"
 #include "util/util_string.h"
@@ -33,6 +34,16 @@ bool namedSizeEntryComparator(const NamedSizeEntry& a, const NamedSizeEntry& b)
        return a.size > b.size;
 }
 
+bool namedTimeSampleEntryComparator(const NamedNestedSampleStats& a, const NamedNestedSampleStats& b)
+{
+       return a.sum_samples > b.sum_samples;
+}
+
+bool namedSampleCountPairComparator(const NamedSampleCountPair& a, const NamedSampleCountPair& b)
+{
+       return a.samples > b.samples;
+}
+
 }  // namespace
 
 NamedSizeEntry::NamedSizeEntry()
@@ -77,6 +88,113 @@ string NamedSizeStats::full_report(int indent_level)
        return result;
 }
 
+/* Named time sample statistics. */
+
+NamedNestedSampleStats::NamedNestedSampleStats()
+ : name(""), self_samples(0), sum_samples(0)
+{}
+
+NamedNestedSampleStats::NamedNestedSampleStats(const string& name, uint64_t samples)
+ : name(name), self_samples(samples), sum_samples(samples)
+{}
+
+NamedNestedSampleStats& NamedNestedSampleStats::add_entry(const string& name_, uint64_t samples_)
+{
+       entries.push_back(NamedNestedSampleStats(name_, samples_));
+       return entries[entries.size()-1];
+}
+
+void NamedNestedSampleStats::update_sum()
+{
+       sum_samples = self_samples;
+       foreach(NamedNestedSampleStats& entry, entries) {
+               entry.update_sum();
+               sum_samples += entry.sum_samples;
+       }
+}
+
+string NamedNestedSampleStats::full_report(int indent_level, uint64_t total_samples)
+{
+       update_sum();
+
+       if(total_samples == 0) {
+               total_samples = sum_samples;
+       }
+
+       const string indent(indent_level * kIndentNumSpaces, ' ');
+
+       const double sum_percent = 100*((double) sum_samples) / total_samples;
+       const double sum_seconds = sum_samples * 0.001;
+       const double self_percent = 100*((double) self_samples) / total_samples;
+       const double self_seconds = self_samples * 0.001;
+       string info = string_printf("%-32s: Total %3.2f%% (%.2fs), Self %3.2f%% (%.2fs)\n",
+                                   name.c_str(),
+                                   sum_percent,
+                                   sum_seconds,
+                                   self_percent,
+                                   self_seconds);
+       string result = indent + info; 
+
+       sort(entries.begin(), entries.end(), namedTimeSampleEntryComparator);
+       foreach(NamedNestedSampleStats& entry, entries) {
+               result += entry.full_report(indent_level + 1, total_samples);
+       }
+       return result;
+}
+
+/* Named sample count pairs. */
+
+NamedSampleCountPair::NamedSampleCountPair(const ustring& name, uint64_t samples, uint64_t hits)
+ : name(name), samples(samples), hits(hits)
+{}
+
+NamedSampleCountStats::NamedSampleCountStats()
+{}
+
+void NamedSampleCountStats::add(const ustring& name, uint64_t samples, uint64_t hits)
+{
+       entry_map::iterator entry = entries.find(name);
+       if(entry != entries.end()) {
+               entry->second.samples += samples;
+               entry->second.hits += hits;
+               return;
+       }
+       entries.emplace(name, NamedSampleCountPair(name, samples, hits));
+}
+
+string NamedSampleCountStats::full_report(int indent_level)
+{
+       const string indent(indent_level * kIndentNumSpaces, ' ');
+
+       vector<NamedSampleCountPair> sorted_entries;
+       sorted_entries.reserve(entries.size());
+
+       uint64_t total_hits = 0, total_samples = 0;
+       foreach(entry_map::const_reference entry, entries) {
+               const NamedSampleCountPair &pair = entry.second;
+
+               total_hits += pair.hits;
+               total_samples += pair.samples;
+
+               sorted_entries.push_back(pair);
+       }
+       const double avg_samples_per_hit = ((double) total_samples) / total_hits;
+
+       sort(sorted_entries.begin(), sorted_entries.end(), namedSampleCountPairComparator);
+
+       string result = "";
+       foreach(const NamedSampleCountPair& entry, sorted_entries) {
+               const double seconds = entry.samples * 0.001;
+               const double relative = ((double) entry.samples) / (entry.hits * avg_samples_per_hit);
+
+               result += indent + string_printf("%-32s: %.2fs (Relative cost: %.2f)\n",
+                                                entry.name.c_str(),
+                                                seconds,
+                                                relative);
+       }
+       return result;
+}
+
 /* Mesh statistics. */
 
 MeshStats::MeshStats() {
@@ -106,6 +224,74 @@ string ImageStats::full_report(int indent_level)
 /* Overall statistics. */
 
 RenderStats::RenderStats() {
+       has_profiling = false;
+}
+
+void RenderStats::collect_profiling(Scene *scene, Stats *stats)
+{
+       has_profiling = true;
+
+       Profiler &prof = stats->profiler;
+
+       kernel = NamedNestedSampleStats("Total render time", prof.get_event(PROFILING_UNKNOWN));
+
+       kernel.add_entry("Ray setup", prof.get_event(PROFILING_RAY_SETUP));
+       kernel.add_entry("Result writing", prof.get_event(PROFILING_WRITE_RESULT));
+
+       NamedNestedSampleStats &integrator = kernel.add_entry("Path integration", prof.get_event(PROFILING_PATH_INTEGRATE));
+       integrator.add_entry("Scene intersection", prof.get_event(PROFILING_SCENE_INTERSECT));
+       integrator.add_entry("Indirect emission", prof.get_event(PROFILING_INDIRECT_EMISSION));
+       integrator.add_entry("Volumes", prof.get_event(PROFILING_VOLUME));
+
+       NamedNestedSampleStats &shading = integrator.add_entry("Shading", 0);
+       shading.add_entry("Shader Setup", prof.get_event(PROFILING_SHADER_SETUP));
+       shading.add_entry("Shader Eval", prof.get_event(PROFILING_SHADER_EVAL));
+       shading.add_entry("Shader Apply", prof.get_event(PROFILING_SHADER_APPLY));
+       shading.add_entry("Ambient Occlusion", prof.get_event(PROFILING_AO));
+       shading.add_entry("Subsurface", prof.get_event(PROFILING_SUBSURFACE));
+
+       integrator.add_entry("Connect Light", prof.get_event(PROFILING_CONNECT_LIGHT));
+       integrator.add_entry("Surface Bounce", prof.get_event(PROFILING_SURFACE_BOUNCE));
+
+       NamedNestedSampleStats &intersection = kernel.add_entry("Intersection", 0);
+       intersection.add_entry("Full Intersection", prof.get_event(PROFILING_INTERSECT));
+       intersection.add_entry("Local Intersection", prof.get_event(PROFILING_INTERSECT_LOCAL));
+       intersection.add_entry("Shadow All Intersection", prof.get_event(PROFILING_INTERSECT_SHADOW_ALL));
+       intersection.add_entry("Volume Intersection", prof.get_event(PROFILING_INTERSECT_VOLUME));
+       intersection.add_entry("Volume All Intersection", prof.get_event(PROFILING_INTERSECT_VOLUME_ALL));
+
+       NamedNestedSampleStats &closure = kernel.add_entry("Closures", 0);
+       closure.add_entry("Surface Closure Evaluation", prof.get_event(PROFILING_CLOSURE_EVAL));
+       closure.add_entry("Surface Closure Sampling", prof.get_event(PROFILING_CLOSURE_SAMPLE));
+       closure.add_entry("Volume Closure Evaluation", prof.get_event(PROFILING_CLOSURE_VOLUME_EVAL));
+       closure.add_entry("Volume Closure Sampling", prof.get_event(PROFILING_CLOSURE_VOLUME_SAMPLE));
+
+       NamedNestedSampleStats &denoising = kernel.add_entry("Denoising", prof.get_event(PROFILING_DENOISING));
+       denoising.add_entry("Construct Transform", prof.get_event(PROFILING_DENOISING_CONSTRUCT_TRANSFORM));
+       denoising.add_entry("Reconstruct", prof.get_event(PROFILING_DENOISING_RECONSTRUCT));
+
+       NamedNestedSampleStats &prefilter = denoising.add_entry("Prefiltering", 0);
+       prefilter.add_entry("Divide Shadow", prof.get_event(PROFILING_DENOISING_DIVIDE_SHADOW));
+       prefilter.add_entry("Non-Local means", prof.get_event(PROFILING_DENOISING_NON_LOCAL_MEANS));
+       prefilter.add_entry("Get Feature", prof.get_event(PROFILING_DENOISING_GET_FEATURE));
+       prefilter.add_entry("Detect Outliers", prof.get_event(PROFILING_DENOISING_DETECT_OUTLIERS));
+       prefilter.add_entry("Combine Halves", prof.get_event(PROFILING_DENOISING_COMBINE_HALVES));
+
+       shaders.entries.clear();
+       foreach(Shader *shader, scene->shaders) {
+               uint64_t samples, hits;
+               if(prof.get_shader(shader->id, samples, hits)) {
+                       shaders.add(shader->name, samples, hits);
+               }
+       }
+
+       objects.entries.clear();
+       foreach(Object *object, scene->objects) {
+               uint64_t samples, hits;
+               if(prof.get_object(object->get_device_index(), samples, hits)) {
+                       objects.add(object->name, samples, hits);
+               }
+       }
 }
 
 string RenderStats::full_report()
@@ -113,6 +299,14 @@ string RenderStats::full_report()
        string result = "";
        result += "Mesh statistics:\n" + mesh.full_report(1);
        result += "Image statistics:\n" + image.full_report(1);
+       if(has_profiling) {
+               result += "Kernel statistics:\n" + kernel.full_report(1);
+               result += "Shader statistics:\n" + shaders.full_report(1);
+               result += "Object statistics:\n" + objects.full_report(1);
+       }
+       else {
+               result += "Profiling information not available (only works with CPU rendering)";
+       }
        return result;
 }
 
index 2ff0ec3e0e95004f3c38997ce8d5e1ebb41c8a30..ce0529bb8bd20b59ded4168cb6f9168c1c85201d 100644 (file)
@@ -17,6 +17,9 @@
 #ifndef __RENDER_STATS_H__
 #define __RENDER_STATS_H__
 
+#include "render/scene.h"
+
+#include "util/util_stats.h"
 #include "util/util_string.h"
 #include "util/util_vector.h"
 
@@ -61,6 +64,51 @@ public:
        vector<NamedSizeEntry> entries;
 };
 
+class NamedNestedSampleStats {
+public:
+       NamedNestedSampleStats();
+       NamedNestedSampleStats(const string& name, uint64_t samples);
+
+       NamedNestedSampleStats& add_entry(const string& name, uint64_t samples);
+
+       /* Updates sum_samples recursively. */
+       void update_sum();
+
+       string full_report(int indent_level = 0, uint64_t total_samples = 0);
+
+       string name;
+
+       /* self_samples contains only the samples that this specific event got,
+        * while sum_samples also includes the samples of all sub-entries. */
+       uint64_t self_samples, sum_samples;
+
+       vector<NamedNestedSampleStats> entries;
+};
+
+/* Named entry containing both a time-sample count for objects of a type and a
+ * total count of processed items.
+ * This allows to estimate the time spent per item. */
+class NamedSampleCountPair {
+public:
+       NamedSampleCountPair(const ustring& name, uint64_t samples, uint64_t hits);
+
+       ustring name;
+       uint64_t samples;
+       uint64_t hits;
+};
+
+/* Contains statistics about pairs of samples and counts as described above. */
+class NamedSampleCountStats {
+public:
+       NamedSampleCountStats();
+
+       string full_report(int indent_level = 0);
+       void add(const ustring& name, uint64_t samples, uint64_t hits);
+
+       typedef unordered_map<ustring, NamedSampleCountPair, ustringHash> entry_map;
+       entry_map entries;
+};
+
 /* Statistics about mesh in the render database. */
 class MeshStats {
 public:
@@ -95,8 +143,16 @@ public:
        /* Return full report as string. */
        string full_report();
 
+       /* Collect kernel sampling information from Stats. */
+       void collect_profiling(Scene *scene, Stats *stats);
+
+       bool has_profiling;
+
        MeshStats mesh;
        ImageStats image;
+       NamedNestedSampleStats kernel;
+       NamedSampleCountStats shaders;
+       NamedSampleCountStats objects;
 };
 
 CCL_NAMESPACE_END
index 77d47984ee797537d95dc779dc1f2eb871e91c41..92dfc9fa85d82df278297993808f07f9f2c2c04f 100644 (file)
@@ -17,6 +17,7 @@ set(SRC
        util_md5.cpp
        util_murmurhash.cpp
        util_path.cpp
+       util_profiling.cpp
        util_string.cpp
        util_simd.cpp
        util_system.cpp
@@ -71,6 +72,7 @@ set(SRC_HEADERS
        util_optimization.h
        util_param.h
        util_path.h
+       util_profiling.h
        util_progress.h
        util_projection.h
        util_queue.h
diff --git a/intern/cycles/util/util_profiling.cpp b/intern/cycles/util/util_profiling.cpp
new file mode 100644 (file)
index 0000000..30aaef6
--- /dev/null
@@ -0,0 +1,178 @@
+/*
+ * Copyright 2011-2018 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "util/util_algorithm.h"
+#include "util/util_profiling.h"
+#include "util/util_set.h"
+
+CCL_NAMESPACE_BEGIN
+
+Profiler::Profiler()
+ : do_stop_worker(true), worker(NULL)
+{
+}
+
+Profiler::~Profiler()
+{
+       assert(worker == NULL);
+}
+
+void Profiler::run()
+{
+       uint64_t updates = 0;
+       auto start_time = std::chrono::system_clock::now();
+       while(!do_stop_worker) {
+               thread_scoped_lock lock(mutex);
+               foreach(ProfilingState *state, states) {
+                       uint32_t cur_event = state->event;
+                       int32_t cur_shader = state->shader;
+                       int32_t cur_object = state->object;
+
+                       /* The state reads/writes should be atomic, but just to be sure
+                        * check the values for validity anyways. */
+                       if(cur_event < PROFILING_NUM_EVENTS) {
+                               event_samples[cur_event]++;
+                       }
+
+                       if(cur_shader >= 0 && cur_shader < shader_samples.size()) {
+                               /* Only consider the active shader during events whose runtime significantly depends on it. */
+                               if(((cur_event >= PROFILING_SHADER_EVAL ) && (cur_event <= PROFILING_SUBSURFACE)) ||
+                                  ((cur_event >= PROFILING_CLOSURE_EVAL) && (cur_event <= PROFILING_CLOSURE_VOLUME_SAMPLE))) {
+                                       shader_samples[cur_shader]++;
+                               }
+                       }
+
+                       if(cur_object >= 0 && cur_object < object_samples.size()) {
+                               object_samples[cur_object]++;
+                       }
+               }
+               lock.unlock();
+
+               /* Relative waits always overshoot a bit, so just waiting 1ms every
+                * time would cause the sampling to drift over time.
+                * By keeping track of the absolute time, the wait times correct themselves -
+                * if one wait overshoots a lot, the next one will be shorter to compensate. */
+               updates++;
+               std::this_thread::sleep_until(start_time + updates*std::chrono::milliseconds(1));
+       }
+}
+
+void Profiler::reset(int num_shaders, int num_objects)
+{
+       bool running = (worker != NULL);
+       if(running) {
+               stop();
+       }
+
+       /* Resize and clear the accumulation vectors. */
+       shader_hits.assign(num_shaders, 0);
+       object_hits.assign(num_objects, 0);
+
+       event_samples.assign(PROFILING_NUM_EVENTS, 0);
+       shader_samples.assign(num_shaders, 0);
+       object_samples.assign(num_objects, 0);
+
+       if(running) {
+               start();
+       }
+}
+
+void Profiler::start()
+{
+       assert(worker == NULL);
+       do_stop_worker = false;
+       worker = new thread(function_bind(&Profiler::run, this));
+}
+
+void Profiler::stop()
+{
+       if(worker != NULL) {
+               do_stop_worker = true;
+
+               worker->join();
+               delete worker;
+               worker = NULL;
+       }
+}
+
+void Profiler::add_state(ProfilingState *state)
+{
+       thread_scoped_lock lock(mutex);
+
+       /* Add the ProfilingState from the list of sampled states. */
+       assert(std::find(states.begin(), states.end(), state) == states.end());
+       states.push_back(state);
+
+       /* Resize thread-local hit counters. */
+       state->shader_hits.assign(shader_hits.size(), 0);
+       state->object_hits.assign(object_hits.size(), 0);
+
+       /* Initialize the state. */
+       state->event = PROFILING_UNKNOWN;
+       state->shader = -1;
+       state->object = -1;
+       state->active = true;
+}
+
+void Profiler::remove_state(ProfilingState *state)
+{
+       thread_scoped_lock lock(mutex);
+
+       /* Remove the ProfilingState from the list of sampled states. */
+       states.erase(std::remove(states.begin(), states.end(), state), states.end());
+       state->active = false;
+
+       /* Merge thread-local hit counters. */
+       assert(shader_hits.size() == state->shader_hits.size());
+       for(int i = 0; i < shader_hits.size(); i++) {
+               shader_hits[i] += state->shader_hits[i];
+       }
+
+       assert(object_hits.size() == state->object_hits.size());
+       for(int i = 0; i < object_hits.size(); i++) {
+               object_hits[i] += state->object_hits[i];
+       }
+}
+
+uint64_t Profiler::get_event(ProfilingEvent event)
+{
+       assert(worker == NULL);
+       return event_samples[event];
+}
+
+bool Profiler::get_shader(int shader, uint64_t &samples, uint64_t &hits)
+{
+       assert(worker == NULL);
+       if(shader_samples[shader] == 0) {
+               return false;
+       }
+       samples = shader_samples[shader];
+       hits = shader_hits[shader];
+       return true;
+}
+
+bool Profiler::get_object(int object, uint64_t &samples, uint64_t &hits)
+{
+       assert(worker == NULL);
+       if(object_samples[object] == 0) {
+               return false;
+       }
+       samples = object_samples[object];
+       hits = object_hits[object];
+       return true;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_profiling.h b/intern/cycles/util/util_profiling.h
new file mode 100644 (file)
index 0000000..4460402
--- /dev/null
@@ -0,0 +1,175 @@
+/*
+ * Copyright 2011-2018 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_PROFILING_H__
+#define __UTIL_PROFILING_H__
+
+#include <atomic>
+
+#include "util/util_foreach.h"
+#include "util/util_map.h"
+#include "util/util_thread.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+enum ccl_try_align(16) ProfilingEvent : uint32_t {
+       PROFILING_UNKNOWN,
+       PROFILING_RAY_SETUP,
+       PROFILING_PATH_INTEGRATE,
+       PROFILING_SCENE_INTERSECT,
+       PROFILING_INDIRECT_EMISSION,
+       PROFILING_VOLUME,
+       PROFILING_SHADER_SETUP,
+       PROFILING_SHADER_EVAL,
+       PROFILING_SHADER_APPLY,
+       PROFILING_AO,
+       PROFILING_SUBSURFACE,
+       PROFILING_CONNECT_LIGHT,
+       PROFILING_SURFACE_BOUNCE,
+       PROFILING_WRITE_RESULT,
+
+       PROFILING_INTERSECT,
+       PROFILING_INTERSECT_LOCAL,
+       PROFILING_INTERSECT_SHADOW_ALL,
+       PROFILING_INTERSECT_VOLUME,
+       PROFILING_INTERSECT_VOLUME_ALL,
+
+       PROFILING_CLOSURE_EVAL,
+       PROFILING_CLOSURE_SAMPLE,
+       PROFILING_CLOSURE_VOLUME_EVAL,
+       PROFILING_CLOSURE_VOLUME_SAMPLE,
+
+       PROFILING_DENOISING,
+       PROFILING_DENOISING_CONSTRUCT_TRANSFORM,
+       PROFILING_DENOISING_RECONSTRUCT,
+       PROFILING_DENOISING_DIVIDE_SHADOW,
+       PROFILING_DENOISING_NON_LOCAL_MEANS,
+       PROFILING_DENOISING_COMBINE_HALVES,
+       PROFILING_DENOISING_GET_FEATURE,
+       PROFILING_DENOISING_DETECT_OUTLIERS,
+
+       PROFILING_NUM_EVENTS,
+};
+
+/* Contains the current execution state of a worker thread.
+ * These values are constantly updated by the worker.
+ * Periodically the profiler thread will wake up, read them
+ * and update its internal counters based on it.
+ *
+ * Atomics aren't needed here since we're only doing direct
+ * writes and reads to (4-byte-aligned) uint32_t, which is
+ * guaranteed to be atomic on x86 since the 486.
+ * Memory ordering is not guaranteed but does not matter.
+ *
+ * And even on other architectures, the extremely rare corner
+ * case of reading an intermediate state could at worst result
+ * in a single incorrect sample. */
+struct ProfilingState {
+       volatile uint32_t event = PROFILING_UNKNOWN;
+       volatile int32_t shader = -1;
+       volatile int32_t object = -1;
+       volatile bool active = false;
+
+       vector<uint64_t> shader_hits;
+       vector<uint64_t> object_hits;
+};
+
+class Profiler {
+public:
+       Profiler();
+       ~Profiler();
+
+       void reset(int num_shaders, int num_objects);
+
+       void start();
+       void stop();
+
+       void add_state(ProfilingState *state);
+       void remove_state(ProfilingState *state);
+
+       uint64_t get_event(ProfilingEvent event);
+       bool get_shader(int shader, uint64_t &samples, uint64_t &hits);
+       bool get_object(int object, uint64_t &samples, uint64_t &hits);
+
+protected:
+       void run();
+
+       /* Tracks how often the worker was in each ProfilingEvent while sampling,
+        * so multiplying the values by the sample frequency (currently 1ms)
+        * gives the approximate time spent in each state. */
+       vector<uint64_t> event_samples;
+       vector<uint64_t> shader_samples;
+       vector<uint64_t> object_samples;
+
+       /* Tracks the total amounts every object/shader was hit.
+        * Used to evaluate relative cost, written by the render thread.
+        * Indexed by the shader and object IDs that the kernel also uses
+        * to index __object_flag and __shaders. */
+       vector<uint64_t> shader_hits;
+       vector<uint64_t> object_hits;
+
+       volatile bool do_stop_worker;
+       thread *worker;
+
+       thread_mutex mutex;
+       vector<ProfilingState*> states;
+};
+
+class ProfilingHelper {
+public:
+       ProfilingHelper(ProfilingState *state, ProfilingEvent event)
+        : state(state)
+       {
+               previous_event = state->event;
+               state->event = event;
+       }
+
+       inline void set_event(ProfilingEvent event)
+       {
+               state->event = event;
+       }
+
+       inline void set_shader(int shader)
+       {
+               state->shader = shader;
+               if(state->active) {
+                       assert(shader < state->shader_hits.size());
+                       state->shader_hits[shader]++;
+               }
+       }
+
+       inline void set_object(int object)
+       {
+               state->object = object;
+               if(state->active) {
+                       assert(object < state->object_hits.size());
+                       state->object_hits[object]++;
+               }
+       }
+
+       ~ProfilingHelper()
+       {
+               state->event = previous_event;
+       }
+private:
+       ProfilingState *state;
+       uint32_t previous_event;
+};
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_PROFILING_H__ */
index 0ba58422a6778ef0573a44c6d52a7c62d52dd1cc..1128bc3ced12b15163f8a100aadd53f996c3f81c 100644 (file)
@@ -18,6 +18,7 @@
 #define __UTIL_STATS_H__
 
 #include "util/util_atomic.h"
+#include "util/util_profiling.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -40,6 +41,8 @@ public:
 
        size_t mem_used;
        size_t mem_peak;
+
+       Profiler profiler;
 };
 
 CCL_NAMESPACE_END