Image viewer scopes update: OMP->BLI_task.
author Bastien Montagne <montagne29@wanadoo.fr>
Thu, 26 May 2016 12:30:14 +0000 (14:30 +0200)
committer Bastien Montagne <montagne29@wanadoo.fr>
Thu, 26 May 2016 12:33:26 +0000 (14:33 +0200)
Makes the scope update over 50% faster (from 4.5 ms to 2.2 ms here with an SD shot)!
This is probably mostly due to smarter use of thread-local data, which avoids any locking,
whereas the OMP code had a rather stupid critical section for minmax.

source/blender/blenkernel/intern/colortools.c

index bac59c8c62d071854ca1769eb614fd0dd5a67139..c1f1f0128f510f42f65551d165b9eccfcaeabab6 100644 (file)
@@ -43,6 +43,7 @@
 #include "BLI_blenlib.h"
 #include "BLI_math.h"
 #include "BLI_utildefines.h"
+#include "BLI_task.h"
 #include "BLI_threads.h"
 
 #include "BKE_colortools.h"
 #include "IMB_colormanagement.h"
 #include "IMB_imbuf_types.h"
 
-#ifdef _OPENMP
-#  include <omp.h>
-#endif
-
 /* ********************************* color curve ********************* */
 
 /* ***************** operations on full struct ************* */
@@ -1089,31 +1086,170 @@ void BKE_histogram_update_sample_line(Histogram *hist, ImBuf *ibuf, const ColorM
 }
 
 /* if view_settings, it also applies this to byte buffers */
+typedef struct ScopesUpdateData {
+       Scopes *scopes;
+       const ImBuf *ibuf;
+       struct ColormanageProcessor *cm_processor;
+       const unsigned char *display_buffer;
+       const int ycc_mode;
+
+       unsigned int *bin_lum, *bin_r, *bin_g, *bin_b, *bin_a;
+} ScopesUpdateData;
+
+typedef struct ScopesUpdateDataChunk {
+       unsigned int bin_lum[256];
+       unsigned int bin_r[256];
+       unsigned int bin_g[256];
+       unsigned int bin_b[256];
+       unsigned int bin_a[256];
+       float min[3], max[3];
+} ScopesUpdateDataChunk;
+
+static void scopes_update_cb(void *userdata, void *userdata_chunk, const int y, const int UNUSED(threadid))
+{
+       const ScopesUpdateData *data = userdata;
+
+       Scopes *scopes = data->scopes;
+       const ImBuf *ibuf = data->ibuf;
+       struct ColormanageProcessor *cm_processor = data->cm_processor;
+       const unsigned char *display_buffer = data->display_buffer;
+       const int ycc_mode = data->ycc_mode;
+
+       ScopesUpdateDataChunk *data_chunk = userdata_chunk;
+       unsigned int *bin_lum = data_chunk->bin_lum;
+       unsigned int *bin_r = data_chunk->bin_r;
+       unsigned int *bin_g = data_chunk->bin_g;
+       unsigned int *bin_b = data_chunk->bin_b;
+       unsigned int *bin_a = data_chunk->bin_a;
+       float *min = data_chunk->min;
+       float *max = data_chunk->max;
+
+       const float *rf = NULL;
+       const unsigned char *rc = NULL;
+       const int rows_per_sample_line = ibuf->y / scopes->sample_lines;
+       const int savedlines = y / rows_per_sample_line;
+       const bool do_sample_line = (savedlines < scopes->sample_lines) && (y % rows_per_sample_line) == 0;
+       const bool is_float = (ibuf->rect_float != NULL);
+
+       if (is_float)
+               rf = ibuf->rect_float + ((size_t)y) * ibuf->x * ibuf->channels;
+       else {
+               rc = display_buffer + ((size_t)y) * ibuf->x * ibuf->channels;
+       }
+
+       for (int x = 0; x < ibuf->x; x++) {
+               float rgba[4], ycc[3], luma;
+
+               if (is_float) {
+                       switch (ibuf->channels) {
+                               case 4:
+                                       copy_v4_v4(rgba, rf);
+                                       IMB_colormanagement_processor_apply_v4(cm_processor, rgba);
+                                       break;
+                               case 3:
+                                       copy_v3_v3(rgba, rf);
+                                       IMB_colormanagement_processor_apply_v3(cm_processor, rgba);
+                                       rgba[3] = 1.0f;
+                                       break;
+                               case 2:
+                                       copy_v3_fl(rgba, rf[0]);
+                                       rgba[3] = rf[1];
+                                       break;
+                               case 1:
+                                       copy_v3_fl(rgba, rf[0]);
+                                       rgba[3] = 1.0f;
+                                       break;
+                               default:
+                                       BLI_assert(0);
+                       }
+               }
+               else {
+                       for (int c = 4; c--;)
+                               rgba[c] = rc[c] * INV_255;
+               }
+
+               /* we still need luma for histogram */
+               luma = IMB_colormanagement_get_luminance(rgba);
+
+               /* check for min max */
+               if (ycc_mode == -1) {
+                       minmax_v3v3_v3(min, max, rgba);
+               }
+               else {
+                       rgb_to_ycc(rgba[0], rgba[1], rgba[2], &ycc[0], &ycc[1], &ycc[2], ycc_mode);
+                       mul_v3_fl(ycc, INV_255);
+                       minmax_v3v3_v3(min, max, ycc);
+               }
+               /* increment count for histo*/
+               bin_lum[get_bin_float(luma)]++;
+               bin_r[get_bin_float(rgba[0])]++;
+               bin_g[get_bin_float(rgba[1])]++;
+               bin_b[get_bin_float(rgba[2])]++;
+               bin_a[get_bin_float(rgba[3])]++;
+
+               /* save sample if needed */
+               if (do_sample_line) {
+                       const float fx = (float)x / (float)ibuf->x;
+                       const int idx = 2 * (ibuf->x * savedlines + x);
+                       save_sample_line(scopes, idx, fx, rgba, ycc);
+               }
+
+               rf += ibuf->channels;
+               rc += ibuf->channels;
+       }
+}
+
+static void scopes_update_finalize(void *userdata, void *userdata_chunk)
+{
+       const ScopesUpdateData *data = userdata;
+       const ScopesUpdateDataChunk *data_chunk = userdata_chunk;
+
+       unsigned int *bin_lum = data->bin_lum;
+       unsigned int *bin_r = data->bin_r;
+       unsigned int *bin_g = data->bin_g;
+       unsigned int *bin_b = data->bin_b;
+       unsigned int *bin_a = data->bin_a;
+       const unsigned int *bin_lum_c = data_chunk->bin_lum;
+       const unsigned int *bin_r_c = data_chunk->bin_r;
+       const unsigned int *bin_g_c = data_chunk->bin_g;
+       const unsigned int *bin_b_c = data_chunk->bin_b;
+       const unsigned int *bin_a_c = data_chunk->bin_a;
+
+       float (*minmax)[2] = data->scopes->minmax;
+       const float *min = data_chunk->min;
+       const float *max = data_chunk->max;
+
+       for (int b = 256; b--;) {
+               bin_lum[b] += bin_lum_c[b];
+               bin_r[b] += bin_r_c[b];
+               bin_g[b] += bin_g_c[b];
+               bin_b[b] += bin_b_c[b];
+               bin_a[b] += bin_a_c[b];
+       }
+
+       for (int c = 3; c--;) {
+               if (min[c] < minmax[c][0])
+                       minmax[c][0] = min[c];
+               if (max[c] > minmax[c][1])
+                       minmax[c][1] = max[c];
+       }
+}
+
 void scopes_update(Scopes *scopes, ImBuf *ibuf, const ColorManagedViewSettings *view_settings,
                    const ColorManagedDisplaySettings *display_settings)
 {
-#ifdef _OPENMP
-       const int num_threads = BLI_system_thread_count();
-#endif
-       int a, y;
+       int a;
        unsigned int nl, na, nr, ng, nb;
        double divl, diva, divr, divg, divb;
-       unsigned char *display_buffer;
+       const unsigned char *display_buffer = NULL;
        unsigned int bin_lum[256] = {0},
                     bin_r[256] = {0},
                     bin_g[256] = {0},
                     bin_b[256] = {0},
                     bin_a[256] = {0};
-       unsigned int bin_lum_t[BLENDER_MAX_THREADS][256] = {{0}},
-                    bin_r_t[BLENDER_MAX_THREADS][256] = {{0}},
-                    bin_g_t[BLENDER_MAX_THREADS][256] = {{0}},
-                    bin_b_t[BLENDER_MAX_THREADS][256] = {{0}},
-                    bin_a_t[BLENDER_MAX_THREADS][256] = {{0}};
        int ycc_mode = -1;
-       const bool is_float = (ibuf->rect_float != NULL);
        void *cache_handle = NULL;
        struct ColormanageProcessor *cm_processor = NULL;
-       int rows_per_sample_line;
 
        if (ibuf->rect == NULL && ibuf->rect_float == NULL) return;
 
@@ -1151,7 +1287,6 @@ void scopes_update(Scopes *scopes, ImBuf *ibuf, const ColorManagedViewSettings *
                scopes->sample_lines = ibuf->y;
 
        /* scan the image */
-       rows_per_sample_line = ibuf->y / scopes->sample_lines;
        for (a = 0; a < 3; a++) {
                scopes->minmax[a][0] = 25500.0f;
                scopes->minmax[a][1] = -25500.0f;
@@ -1177,129 +1312,21 @@ void scopes_update(Scopes *scopes, ImBuf *ibuf, const ColorManagedViewSettings *
                cm_processor = IMB_colormanagement_display_processor_new(view_settings, display_settings);
        }
        else {
-               display_buffer = (unsigned char *)IMB_display_buffer_acquire(ibuf,
-                                                                            view_settings,
-                                                                            display_settings,
-                                                                            &cache_handle);
+               display_buffer = (const unsigned char *)IMB_display_buffer_acquire(
+                                                           ibuf, view_settings, display_settings, &cache_handle);
        }
 
        /* Keep number of threads in sync with the merge parts below. */
-#pragma omp parallel for private(y) schedule(static) num_threads(num_threads) if (ibuf->y > 256)
-       for (y = 0; y < ibuf->y; y++) {
-#ifdef _OPENMP
-               const int thread_idx = omp_get_thread_num();
-#else
-               const int thread_idx = 0;
-#endif
-               const float *rf = NULL;
-               const unsigned char *rc = NULL;
-               const int savedlines = y / rows_per_sample_line;
-               const bool do_sample_line = (savedlines < scopes->sample_lines) && (y % rows_per_sample_line) == 0;
-               float min[3] = { FLT_MAX,  FLT_MAX,  FLT_MAX},
-                     max[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};
-               int x, c;
-               if (is_float)
-                       rf = ibuf->rect_float + ((size_t)y) * ibuf->x * ibuf->channels;
-               else {
-                       rc = display_buffer + ((size_t)y) * ibuf->x * ibuf->channels;
-               }
-               for (x = 0; x < ibuf->x; x++) {
-                       float rgba[4], ycc[3], luma;
-                       if (is_float) {
-
-                               switch (ibuf->channels) {
-                                       case 4:
-                                               copy_v4_v4(rgba, rf);
-                                               IMB_colormanagement_processor_apply_v4(cm_processor, rgba);
-                                               break;
-                                       case 3:
-                                               copy_v3_v3(rgba, rf);
-                                               IMB_colormanagement_processor_apply_v3(cm_processor, rgba);
-                                               rgba[3] = 1.0f;
-                                               break;
-                                       case 2:
-                                               copy_v3_fl(rgba, rf[0]);
-                                               rgba[3] = rf[1];
-                                               break;
-                                       case 1:
-                                               copy_v3_fl(rgba, rf[0]);
-                                               rgba[3] = 1.0f;
-                                               break;
-                                       default:
-                                               BLI_assert(0);
-                               }
-                       }
-                       else {
-                               for (c = 0; c < 4; c++)
-                                       rgba[c] = rc[c] * INV_255;
-                       }
-
-                       /* we still need luma for histogram */
-                       luma = IMB_colormanagement_get_luminance(rgba);
-
-                       /* check for min max */
-                       if (ycc_mode == -1) {
-                               for (c = 0; c < 3; c++) {
-                                       if (rgba[c] < min[c]) min[c] = rgba[c];
-                                       if (rgba[c] > max[c]) max[c] = rgba[c];
-                               }
-                       }
-                       else {
-                               rgb_to_ycc(rgba[0], rgba[1], rgba[2], &ycc[0], &ycc[1], &ycc[2], ycc_mode);
-                               for (c = 0; c < 3; c++) {
-                                       ycc[c] *= INV_255;
-                                       if (ycc[c] < min[c]) min[c] = ycc[c];
-                                       if (ycc[c] > max[c]) max[c] = ycc[c];
-                               }
-                       }
-                       /* increment count for histo*/
-                       bin_lum_t[thread_idx][get_bin_float(luma)] += 1;
-                       bin_r_t[thread_idx][get_bin_float(rgba[0])] += 1;
-                       bin_g_t[thread_idx][get_bin_float(rgba[1])] += 1;
-                       bin_b_t[thread_idx][get_bin_float(rgba[2])] += 1;
-                       bin_a_t[thread_idx][get_bin_float(rgba[3])] += 1;
-
-                       /* save sample if needed */
-                       if (do_sample_line) {
-                               const float fx = (float)x / (float)ibuf->x;
-                               const int idx = 2 * (ibuf->x * savedlines + x);
-                               save_sample_line(scopes, idx, fx, rgba, ycc);
-                       }
-
-                       rf += ibuf->channels;
-                       rc += ibuf->channels;
-               }
-#pragma omp critical
-               {
-                       for (c = 0; c < 3; c++) {
-                               if (min[c] < scopes->minmax[c][0]) scopes->minmax[c][0] = min[c];
-                               if (max[c] > scopes->minmax[c][1]) scopes->minmax[c][1] = max[c];
-                       }
-               }
-       }
-
-#ifdef _OPENMP
-       if (ibuf->y > 256) {
-               for (a = 0; a < num_threads; a++) {
-                       int b;
-                       for (b = 0; b < 256; b++) {
-                               bin_lum[b] += bin_lum_t[a][b];
-                               bin_r[b] += bin_r_t[a][b];
-                               bin_g[b] += bin_g_t[a][b];
-                               bin_b[b] += bin_b_t[a][b];
-                               bin_a[b] += bin_a_t[a][b];
-                       }
-               }
-       }
-       else
-#endif
-       {
-               memcpy(bin_lum, bin_lum_t[0], sizeof(bin_lum));
-               memcpy(bin_r, bin_r_t[0], sizeof(bin_r));
-               memcpy(bin_g, bin_g_t[0], sizeof(bin_g));
-               memcpy(bin_b, bin_b_t[0], sizeof(bin_b));
-               memcpy(bin_a, bin_a_t[0], sizeof(bin_a));
-       }
+       ScopesUpdateData data = {
+               .scopes = scopes, . ibuf = ibuf,
+               .cm_processor = cm_processor, .display_buffer = display_buffer, .ycc_mode = ycc_mode,
+               .bin_lum = bin_lum, .bin_r = bin_r, .bin_g = bin_g, .bin_b = bin_b, .bin_a = bin_a,
+       };
+       ScopesUpdateDataChunk data_chunk = {0};
+       INIT_MINMAX(data_chunk.min, data_chunk.max);
+
+       BLI_task_parallel_range_finalize(0, ibuf->y, &data, &data_chunk, sizeof(data_chunk),
+                                        scopes_update_cb, scopes_update_finalize, ibuf->y > 256, false);
 
        /* test for nicer distribution even - non standard, leave it out for a while */
 #if 0