[FFmpeg-devel] [PATCH] libavcodec/pthread_slice: for large job counts, avoid lock/unlock between jobs on the same thread

Tom Butterworth bangnoise at gmail.com
Mon Jul 13 18:34:00 CEST 2015


Currently, for large job counts, pthread_slice.c acquires and releases a lock
between consecutive jobs. Acquiring the lock can take more time than the job
itself.

The DDS and Hap decoders naively create a job per 4x4 pixel block. For a 4Kx2K
frame:

    decode before patch: 1562ms
     decode after patch: 14ms

Clients should probably be able to submit jobs without having to consider the
number of threads the jobs will run on, and this patch is one possible way to
allow that. It splits the submitted jobs into at most one contiguous batch per
thread, so a worker takes the lock once per batch rather than once per job.
This assumes that all jobs take roughly the same amount of time and that
threads are scheduled evenly.

---
 libavcodec/pthread_slice.c | 39 ++++++++++++++++++++++++++++++++-------
 1 file changed, 32 insertions(+), 7 deletions(-)

diff --git a/libavcodec/pthread_slice.c b/libavcodec/pthread_slice.c
index c8e69f0..751e5f4 100644
--- a/libavcodec/pthread_slice.c
+++ b/libavcodec/pthread_slice.c
@@ -50,9 +50,11 @@ typedef struct SliceThreadContext {
     action_func2 *func2;
     void *args;
     int *rets;
+    int *subjob_limits;
+    int *subjob_offsets;
     int rets_count;
     int job_count;
-    int job_size;
+    int subjob_size;

     pthread_cond_t last_job_cond;
     pthread_cond_t current_job_cond;
@@ -76,6 +78,7 @@ static void* attribute_align_arg worker(void *v)
     int our_job = c->job_count;
     int thread_count = avctx->thread_count;
     int self_id;
+    int i;

     pthread_mutex_lock(&c->current_job_lock);
     self_id = c->current_job++;
@@ -96,8 +99,10 @@ static void* attribute_align_arg worker(void *v)
         }
         pthread_mutex_unlock(&c->current_job_lock);

-        c->rets[our_job%c->rets_count] = c->func ? c->func(avctx, (char*)c->args + our_job*c->job_size):
-                                                   c->func2(avctx, c->args, our_job, self_id);
+        for (i = c->subjob_offsets[our_job]; i < c->subjob_limits[our_job]; i++) {
+            c->rets[i%c->rets_count] = c->func ? c->func(avctx, (char*)c->args + i*c->subjob_size):
+            c->func2(avctx, c->args, i, self_id);
+        }

         pthread_mutex_lock(&c->current_job_lock);
         our_job = c->current_job++;
@@ -133,6 +138,8 @@ void ff_slice_thread_free(AVCodecContext *avctx)
     av_freep(&c->progress_cond);

     av_freep(&c->workers);
+    av_freep(&c->subjob_limits);
+    av_freep(&c->subjob_offsets);
     av_freep(&avctx->internal->thread_ctx);
 }

@@ -146,7 +153,7 @@ static av_always_inline void thread_park_workers(SliceThreadContext *c, int thre
 static int thread_execute(AVCodecContext *avctx, action_func* func, void *arg, int *ret, int job_count, int job_size)
 {
     SliceThreadContext *c = avctx->internal->thread_ctx;
-    int dummy_ret;
+    int dummy_ret, i;

     if (!(avctx->active_thread_type&FF_THREAD_SLICE) || avctx->thread_count <= 1)
         return avcodec_default_execute(avctx, func, arg, ret, job_count, job_size);
@@ -157,8 +164,14 @@ static int thread_execute(AVCodecContext *avctx, action_func* func, void *arg, i
     pthread_mutex_lock(&c->current_job_lock);

     c->current_job = avctx->thread_count;
-    c->job_count = job_count;
-    c->job_size = job_size;
+    c->job_count = FFMIN(job_count, avctx->thread_count);
+    c->subjob_offsets[0] = 0;
+    c->subjob_limits[0] = (job_count / c->job_count) + (job_count % c->job_count);
+    for (i = 1; i < c->job_count; i++) {
+        c->subjob_offsets[i] = c->subjob_limits[i-1];
+        c->subjob_limits[i] = c->subjob_offsets[i] + (job_count / c->job_count);
+    }
+    c->subjob_size = job_size;
     c->args = arg;
     c->func = func;
     if (ret) {
@@ -218,17 +231,29 @@ int ff_slice_thread_init(AVCodecContext *avctx)
         av_free(c);
         return -1;
     }
+    c->subjob_offsets = av_mallocz_array(thread_count, sizeof(int));
+    if (!c->subjob_offsets) {
+        av_free(c);
+        return -1;
+    }
+    c->subjob_limits = av_mallocz_array(thread_count, sizeof(int));
+    if (!c->subjob_limits) {
+        av_free(c);
+        return -1;
+    }
+

     avctx->internal->thread_ctx = c;
     c->current_job = 0;
     c->job_count = 0;
-    c->job_size = 0;
+    c->subjob_size = 0;
     c->done = 0;
     pthread_cond_init(&c->current_job_cond, NULL);
     pthread_cond_init(&c->last_job_cond, NULL);
     pthread_mutex_init(&c->current_job_lock, NULL);
     pthread_mutex_lock(&c->current_job_lock);
     for (i=0; i<thread_count; i++) {
+        c->subjob_limits[i] = c->subjob_offsets[i] = 0;
         if(pthread_create(&c->workers[i], NULL, worker, avctx)) {
            avctx->thread_count = i;
            pthread_mutex_unlock(&c->current_job_lock);
--
2.3.2 (Apple Git-55)


More information about the ffmpeg-devel mailing list