CUDA: use streams and async calls to avoid busy-waiting
author Martijn Berger <martijn.berger@gmail.com>
Thu, 6 Mar 2014 19:51:13 +0000 (20:51 +0100)
committer Martijn Berger <martijn.berger@gmail.com>
Thu, 6 Mar 2014 19:51:46 +0000 (20:51 +0100)
This switches the CUDA API usage towards using more of the asynchronous calls.

Updating only once every second is sufficiently cheap that I don't think it is worth doing it less often.

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D262
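
For context, the pattern this patch moves to, as a minimal standalone sketch (not Cycles code; the hypothetical CU_CHECK() macro stands in for Cycles' cuda_assert()): work is queued on a dedicated CUstream, a CUevent created with CU_EVENT_BLOCKING_SYNC (0x1) is recorded behind it, and cuEventSynchronize() then lets the waiting thread sleep instead of spinning.

#include <cuda.h>
#include <cstdio>
#include <cstdlib>

/* minimal error check, standing in for cuda_assert() */
#define CU_CHECK(stmt) \
	do { \
		CUresult res_ = (stmt); \
		if(res_ != CUDA_SUCCESS) { \
			fprintf(stderr, "%s failed (%d)\n", #stmt, (int)res_); \
			exit(EXIT_FAILURE); \
		} \
	} while(0)

int main()
{
	CU_CHECK(cuInit(0));

	CUdevice dev;
	CUcontext ctx;
	CU_CHECK(cuDeviceGet(&dev, 0));
	CU_CHECK(cuCtxCreate(&ctx, 0, dev));

	/* one stream for the work, one event to wait on; the blocking-sync
	   flag makes cuEventSynchronize() yield the CPU instead of spinning */
	CUstream stream;
	CUevent done;
	CU_CHECK(cuStreamCreate(&stream, 0));
	CU_CHECK(cuEventCreate(&done, CU_EVENT_BLOCKING_SYNC));

	/* queue asynchronous work; in the patch this is cuLaunchGridAsync()
	   launching the path tracing kernel on cuStream */
	CUdeviceptr mem;
	CU_CHECK(cuMemAlloc(&mem, 1024 * 1024));
	CU_CHECK(cuMemsetD8Async(mem, 0, 1024 * 1024, stream));

	/* record the event behind the queued work and block until it fires */
	CU_CHECK(cuEventRecord(done, stream));
	CU_CHECK(cuEventSynchronize(done));

	CU_CHECK(cuMemFree(mem));
	CU_CHECK(cuEventDestroy(done));
	CU_CHECK(cuStreamDestroy(stream));
	CU_CHECK(cuCtxDestroy(ctx));
	return 0;
}

Compared to calling cuCtxSynchronize() after every launch, this lets launches queue up between syncs and avoids the driver's default spin-wait while the host waits.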

intern/cycles/device/device_cuda.cpp

index 0fbb48cf43157600459d8d40316e9ae9d13d1d08..932fdc303a5210c87a9dba0bb77cf44bf6f4ca9f 100644
@@ -41,11 +41,14 @@ public:
        CUdevice cuDevice;
        CUcontext cuContext;
        CUmodule cuModule;
+       CUstream cuStream;
+       CUevent tileDone;
        map<device_ptr, bool> tex_interp_map;
        int cuDevId;
        int cuDevArchitecture;
        bool first_error;
        bool use_texture_storage;
+       unsigned int target_update_frequency; /* target sync interval in milliseconds */
 
        struct PixelMem {
                GLuint cuPBO;
@@ -177,6 +180,8 @@ public:
                first_error = true;
                background = background_;
                use_texture_storage = true;
+               /* we try an update / sync every 1000 ms */
+               target_update_frequency = 1000;
 
                cuDevId = info.num;
                cuDevice = 0;
@@ -207,6 +212,9 @@ public:
                if(cuda_error_(result, "cuCtxCreate"))
                        return;
 
+               cuda_assert(cuStreamCreate(&cuStream, 0))
+               cuda_assert(cuEventCreate(&tileDone, CU_EVENT_BLOCKING_SYNC))
+
                int major, minor;
                cuDeviceComputeCapability(&major, &minor, cuDevId);
                cuDevArchitecture = major*100 + minor*10;
@@ -223,6 +231,8 @@ public:
        {
                task_pool.stop();
 
+               cuda_assert(cuEventDestroy(tileDone))
+               cuda_assert(cuStreamDestroy(cuStream))
                cuda_assert(cuCtxDestroy(cuContext))
        }
 
@@ -645,9 +655,7 @@ public:
 
                cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1))
                cuda_assert(cuFuncSetBlockShape(cuPathTrace, xthreads, ythreads, 1))
-               cuda_assert(cuLaunchGrid(cuPathTrace, xblocks, yblocks))
-
-               cuda_assert(cuCtxSynchronize())
+               cuda_assert(cuLaunchGridAsync(cuPathTrace, xblocks, yblocks, cuStream))
 
                cuda_pop_context();
        }
@@ -964,11 +972,16 @@ public:
                        
                        bool branched = task->integrator_branched;
                        
+
                        /* keep rendering tiles until done */
                        while(task->acquire_tile(this, tile)) {
                                int start_sample = tile.start_sample;
                                int end_sample = tile.start_sample + tile.num_samples;
 
+                               boost::posix_time::ptime start_time(boost::posix_time::microsec_clock::local_time());
+                               boost::posix_time::ptime last_time = start_time;
+                               int sync_sample = 10;
+
                                for(int sample = start_sample; sample < end_sample; sample++) {
                                        if (task->get_cancel()) {
                                                if(task->need_finish_queue == false)
@@ -978,8 +991,28 @@ public:
                                        path_trace(tile, sample, branched);
 
                                        tile.sample = sample + 1;
-
                                        task->update_progress(tile);
+
+                                       if(sample == sync_sample) {
+                                               cuda_push_context();
+                                               cuda_assert(cuEventRecord(tileDone, cuStream))
+                                               cuda_assert(cuEventSynchronize(tileDone))
+
+                                               /* Do some timekeeping to find out if we need to sync less often */
+                                               boost::posix_time::ptime current_time(boost::posix_time::microsec_clock::local_time());
+                                               boost::posix_time::time_duration sample_duration = current_time - last_time;
+
+                                               long msec = sample_duration.total_milliseconds();
+                                               float scaling_factor = (float)target_update_frequency / (float)msec;
+
+                                               /* sync at the earliest at the next sample, and probably later */
+                                               sync_sample = (sample + 1) + sync_sample * ceil(scaling_factor);
+
+                                               sync_sample = min(end_sample - 1, sync_sample); /* make sure we always sync the last sample */
+
+                                               last_time = current_time;
+                                               cuda_pop_context();
+                                       }
                                }
 
                                task->release_tile(tile);
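
The sync cadence in the loop above is adaptive: each blocking sync measures how long the preceding stretch of samples took and scales the gap to the next sync so that syncs land roughly target_update_frequency milliseconds apart. A standalone sketch of that computation (not Cycles code: std::chrono-style plain integers replace boost::posix_time, next_sync_sample() is a hypothetical helper, and the zero-milliseconds guard is an addition):

#include <algorithm>
#include <cmath>

/* given the sample we just synced at and the milliseconds elapsed since
   the previous sync, pick the next sample to sync at */
int next_sync_sample(int sample, int sync_sample, int end_sample,
                     long msec, unsigned int target_update_frequency)
{
	/* how far the measured interval fell below the target */
	float scaling_factor = (float)target_update_frequency /
	                       (float)std::max(msec, 1L); /* guard not in the patch */

	/* sync at the earliest at the next sample, and probably later */
	int next = (sample + 1) + sync_sample * (int)std::ceil(scaling_factor);

	/* make sure we always sync the last sample */
	return std::min(end_sample - 1, next);
}

For example, with the 1000 ms target and 100 ms measured between syncs, a sync at sample 10 schedules the next one at sample 111, so the event sync fires about once a second instead of after every sample.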