[FFmpeg-cvslog] hwcontext_vulkan: use all enabled queues for transfers, make uploads async

Lynne git at videolan.org
Sat May 23 21:08:48 EEST 2020

ffmpeg | branch: master | Lynne <dev at lynne.ee> | Thu May 14 00:28:00 2020 +0100| [c0b0807871985c1d136a7ffd1470aa3e5155b422] | committer: Lynne

hwcontext_vulkan: use all enabled queues for transfers, make uploads async

This commit makes full use of the enabled queues to provide asynchronous
uploads of images (downloads remain synchronous).
For pure upload use cases, the performance gains can be significant.

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=c0b0807871985c1d136a7ffd1470aa3e5155b422

 libavutil/hwcontext_vulkan.c | 333 ++++++++++++++++++++++++++++++-------------
 1 file changed, 237 insertions(+), 96 deletions(-)

diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
index 2f4fe6a797..800db74f74 100644
--- a/libavutil/hwcontext_vulkan.c
+++ b/libavutil/hwcontext_vulkan.c
@@ -41,11 +41,23 @@
 #define CHECK_CU(x) FF_CUDA_CHECK_DL(cuda_cu, cu, x)
+typedef struct VulkanQueueCtx {
+    VkFence fence;
+    VkQueue queue;
+    int was_synchronous;
+    /* Buffer dependencies */
+    AVBufferRef **buf_deps;
+    int nb_buf_deps;
+    int buf_deps_alloc_size;
+} VulkanQueueCtx;
 typedef struct VulkanExecCtx {
     VkCommandPool pool;
-    VkCommandBuffer buf;
-    VkQueue queue;
-    VkFence fence;
+    VkCommandBuffer *bufs;
+    VulkanQueueCtx *queues;
+    int nb_queues;
+    int cur_queue_idx;
 } VulkanExecCtx;
 typedef struct VulkanDevicePriv {
@@ -60,8 +72,9 @@ typedef struct VulkanDevicePriv {
     /* Debug callback */
     VkDebugUtilsMessengerEXT debug_ctx;
-    /* Image uploading */
-    VulkanExecCtx cmd;
+    /* Image transfers */
+    VulkanExecCtx upload_ctx;
+    VulkanExecCtx download_ctx;
     /* Extensions */
     uint64_t extensions;
@@ -89,6 +102,16 @@ typedef struct AVVkFrameInternal {
 } AVVkFrameInternal;
+#define GET_QUEUE_COUNT(hwctx, graph, comp, tx) (                   \
+    graph ?  hwctx->nb_graphics_queues :                            \
+    comp  ? (hwctx->nb_comp_queues ?                                \
+             hwctx->nb_comp_queues : hwctx->nb_graphics_queues) :   \
+    tx    ? (hwctx->nb_tx_queues ? hwctx->nb_tx_queues :            \
+             (hwctx->nb_comp_queues ?                               \
+              hwctx->nb_comp_queues : hwctx->nb_graphics_queues)) : \
+    0                                                               \
 #define VK_LOAD_PFN(inst, name) PFN_##name pfn_##name = (PFN_##name)           \
                                               vkGetInstanceProcAddr(inst, #name)
@@ -709,7 +732,7 @@ fail:
 static int create_exec_ctx(AVHWDeviceContext *ctx, VulkanExecCtx *cmd,
-                           int queue_family_index)
+                           int queue_family_index, int num_queues)
     VkResult ret;
     AVVulkanDeviceContext *hwctx = ctx->hwctx;
@@ -722,21 +745,20 @@ static int create_exec_ctx(AVHWDeviceContext *ctx, VulkanExecCtx *cmd,
     VkCommandBufferAllocateInfo cbuf_create = {
         .level              = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
-        .commandBufferCount = 1,
+        .commandBufferCount = num_queues,
-    VkFenceCreateInfo fence_spawn = {
-    };
+    cmd->nb_queues = num_queues;
-    ret = vkCreateFence(hwctx->act_dev, &fence_spawn,
-                        hwctx->alloc, &cmd->fence);
-    if (ret != VK_SUCCESS) {
-        av_log(ctx, AV_LOG_ERROR, "Failed to create frame fence: %s\n",
-               vk_ret2str(ret));
-        return AVERROR_EXTERNAL;
-    }
+    cmd->queues = av_mallocz(num_queues * sizeof(*cmd->queues));
+    if (!cmd->queues)
+        return AVERROR(ENOMEM);
+    cmd->bufs = av_mallocz(num_queues * sizeof(*cmd->bufs));
+    if (!cmd->bufs)
+        return AVERROR(ENOMEM);
+    /* Create command pool */
     ret = vkCreateCommandPool(hwctx->act_dev, &cqueue_create,
                               hwctx->alloc, &cmd->pool);
     if (ret != VK_SUCCESS) {
@@ -747,15 +769,19 @@ static int create_exec_ctx(AVHWDeviceContext *ctx, VulkanExecCtx *cmd,
     cbuf_create.commandPool = cmd->pool;
-    ret = vkAllocateCommandBuffers(hwctx->act_dev, &cbuf_create, &cmd->buf);
+    /* Allocate command buffer */
+    ret = vkAllocateCommandBuffers(hwctx->act_dev, &cbuf_create, cmd->bufs);
     if (ret != VK_SUCCESS) {
         av_log(ctx, AV_LOG_ERROR, "Command buffer alloc failure: %s\n",
         return AVERROR_EXTERNAL;
-    vkGetDeviceQueue(hwctx->act_dev, cqueue_create.queueFamilyIndex, 0,
-                     &cmd->queue);
+    for (int i = 0; i < num_queues; i++) {
+        VulkanQueueCtx *q = &cmd->queues[i];
+        vkGetDeviceQueue(hwctx->act_dev, queue_family_index, i, &q->queue);
+        q->was_synchronous = 1;
+    }
     return 0;
@@ -764,12 +790,154 @@ static void free_exec_ctx(AVHWDeviceContext *ctx, VulkanExecCtx *cmd)
     AVVulkanDeviceContext *hwctx = ctx->hwctx;
-    if (cmd->fence)
-        vkDestroyFence(hwctx->act_dev, cmd->fence, hwctx->alloc);
-    if (cmd->buf)
-        vkFreeCommandBuffers(hwctx->act_dev, cmd->pool, 1, &cmd->buf);
+    /* Make sure all queues have finished executing */
+    for (int i = 0; i < cmd->nb_queues; i++) {
+        VulkanQueueCtx *q = &cmd->queues[i];
+        if (q->fence && !q->was_synchronous) {
+            vkWaitForFences(hwctx->act_dev, 1, &q->fence, VK_TRUE, UINT64_MAX);
+            vkResetFences(hwctx->act_dev, 1, &q->fence);
+        }
+        /* Free the fence */
+        if (q->fence)
+            vkDestroyFence(hwctx->act_dev, q->fence, hwctx->alloc);
+        /* Free buffer dependencies */
+        for (int j = 0; j < q->nb_buf_deps; j++)
+            av_buffer_unref(&q->buf_deps[j]);
+        av_free(q->buf_deps);
+    }
+    if (cmd->bufs)
+        vkFreeCommandBuffers(hwctx->act_dev, cmd->pool, cmd->nb_queues, cmd->bufs);
     if (cmd->pool)
         vkDestroyCommandPool(hwctx->act_dev, cmd->pool, hwctx->alloc);
+    av_freep(&cmd->bufs);
+    av_freep(&cmd->queues);
+static VkCommandBuffer get_buf_exec_ctx(AVHWDeviceContext *ctx, VulkanExecCtx *cmd)
+    return cmd->bufs[cmd->cur_queue_idx];
+static void unref_exec_ctx_deps(AVHWDeviceContext *ctx, VulkanExecCtx *cmd)
+    VulkanQueueCtx *q = &cmd->queues[cmd->cur_queue_idx];
+    for (int j = 0; j < q->nb_buf_deps; j++)
+        av_buffer_unref(&q->buf_deps[j]);
+    q->nb_buf_deps = 0;
+static int wait_start_exec_ctx(AVHWDeviceContext *ctx, VulkanExecCtx *cmd)
+    VkResult ret;
+    AVVulkanDeviceContext *hwctx = ctx->hwctx;
+    VulkanQueueCtx *q = &cmd->queues[cmd->cur_queue_idx];
+    VkCommandBufferBeginInfo cmd_start = {
+    };
+    /* Create the fence and don't wait for it initially */
+    if (!q->fence) {
+        VkFenceCreateInfo fence_spawn = {
+        };
+        ret = vkCreateFence(hwctx->act_dev, &fence_spawn, hwctx->alloc,
+                            &q->fence);
+        if (ret != VK_SUCCESS) {
+            av_log(ctx, AV_LOG_ERROR, "Failed to queue frame fence: %s\n",
+                   vk_ret2str(ret));
+            return AVERROR_EXTERNAL;
+        }
+    } else if (!q->was_synchronous) {
+        vkWaitForFences(hwctx->act_dev, 1, &q->fence, VK_TRUE, UINT64_MAX);
+        vkResetFences(hwctx->act_dev, 1, &q->fence);
+    }
+    /* Discard queue dependencies */
+    unref_exec_ctx_deps(ctx, cmd);
+    ret = vkBeginCommandBuffer(cmd->bufs[cmd->cur_queue_idx], &cmd_start);
+    if (ret != VK_SUCCESS) {
+        av_log(ctx, AV_LOG_ERROR, "Unable to init command buffer: %s\n",
+               vk_ret2str(ret));
+        return AVERROR_EXTERNAL;
+    }
+    return 0;
+static int add_buf_dep_exec_ctx(AVHWDeviceContext *ctx, VulkanExecCtx *cmd,
+                                AVBufferRef * const *deps, int nb_deps)
+    AVBufferRef **dst;
+    VulkanQueueCtx *q = &cmd->queues[cmd->cur_queue_idx];
+    if (!deps || !nb_deps)
+        return 0;
+    dst = av_fast_realloc(q->buf_deps, &q->buf_deps_alloc_size,
+                          (q->nb_buf_deps + nb_deps) * sizeof(*dst));
+    if (!dst)
+        goto err;
+    q->buf_deps = dst;
+    for (int i = 0; i < nb_deps; i++) {
+        q->buf_deps[q->nb_buf_deps] = av_buffer_ref(deps[i]);
+        if (!q->buf_deps[q->nb_buf_deps])
+            goto err;
+        q->nb_buf_deps++;
+    }
+    return 0;
+    unref_exec_ctx_deps(ctx, cmd);
+    return AVERROR(ENOMEM);
+static int submit_exec_ctx(AVHWDeviceContext *ctx, VulkanExecCtx *cmd,
+                           VkSubmitInfo *s_info, int synchronous)
+    VkResult ret;
+    VulkanQueueCtx *q = &cmd->queues[cmd->cur_queue_idx];
+    ret = vkEndCommandBuffer(cmd->bufs[cmd->cur_queue_idx]);
+    if (ret != VK_SUCCESS) {
+        av_log(ctx, AV_LOG_ERROR, "Unable to finish command buffer: %s\n",
+               vk_ret2str(ret));
+        unref_exec_ctx_deps(ctx, cmd);
+        return AVERROR_EXTERNAL;
+    }
+    s_info->pCommandBuffers = &cmd->bufs[cmd->cur_queue_idx];
+    s_info->commandBufferCount = 1;
+    ret = vkQueueSubmit(q->queue, 1, s_info, q->fence);
+    if (ret != VK_SUCCESS) {
+        unref_exec_ctx_deps(ctx, cmd);
+        return AVERROR_EXTERNAL;
+    }
+    q->was_synchronous = synchronous;
+    if (synchronous) {
+        AVVulkanDeviceContext *hwctx = ctx->hwctx;
+        vkWaitForFences(hwctx->act_dev, 1, &q->fence, VK_TRUE, UINT64_MAX);
+        vkResetFences(hwctx->act_dev, 1, &q->fence);
+        unref_exec_ctx_deps(ctx, cmd);
+    } else { /* Rotate queues */
+        cmd->cur_queue_idx = (cmd->cur_queue_idx + 1) % cmd->nb_queues;
+    }
+    return 0;
 static void vulkan_device_free(AVHWDeviceContext *ctx)
@@ -937,7 +1105,8 @@ if (n >= queue_num) {
         p->qfs[p->num_qfs++] = hwctx->queue_family_comp_index;
     /* Create exec context - if there's something invalid this will error out */
-    err = create_exec_ctx(ctx, &p->cmd, hwctx->queue_family_tx_index);
+    err = create_exec_ctx(ctx, &p->cmd, hwctx->queue_family_tx_index,
+                          GET_QUEUE_COUNT(hwctx, 0, 0, 1));
     if (err)
         return err;
@@ -1257,26 +1426,17 @@ enum PrepMode {
 static int prepare_frame(AVHWFramesContext *hwfc, VulkanExecCtx *ectx,
                          AVVkFrame *frame, enum PrepMode pmode)
-    VkResult ret;
+    int err;
     uint32_t dst_qf;
     VkImageLayout new_layout;
     VkAccessFlags new_access;
     AVHWDeviceContext *ctx = hwfc->device_ctx;
-    AVVulkanDeviceContext *hwctx = ctx->hwctx;
     const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
     VkImageMemoryBarrier img_bar[AV_NUM_DATA_POINTERS] = { 0 };
-    VkCommandBufferBeginInfo cmd_start = {
-    };
     VkSubmitInfo s_info = {
         .sType                = VK_STRUCTURE_TYPE_SUBMIT_INFO,
-        .commandBufferCount   = 1,
-        .pCommandBuffers      = &ectx->buf,
         .pSignalSemaphores    = frame->sem,
         .signalSemaphoreCount = planes,
@@ -1306,9 +1466,8 @@ static int prepare_frame(AVHWFramesContext *hwfc, VulkanExecCtx *ectx,
-    ret = vkBeginCommandBuffer(ectx->buf, &cmd_start);
-    if (ret != VK_SUCCESS)
-        return AVERROR_EXTERNAL;
+    if ((err = wait_start_exec_ctx(ctx, ectx)))
+        return err;
     /* Change the image layout to something more optimal for writes.
      * This also signals the newly created semaphore, making it usable
@@ -1330,23 +1489,12 @@ static int prepare_frame(AVHWFramesContext *hwfc, VulkanExecCtx *ectx,
         frame->access[i] = img_bar[i].dstAccessMask;
-    vkCmdPipelineBarrier(ectx->buf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
-                         VK_PIPELINE_STAGE_TRANSFER_BIT, 0,
-                         0, NULL, 0, NULL, planes, img_bar);
-    ret = vkEndCommandBuffer(ectx->buf);
-    if (ret != VK_SUCCESS)
-        return AVERROR_EXTERNAL;
-    ret = vkQueueSubmit(ectx->queue, 1, &s_info, ectx->fence);
-    if (ret != VK_SUCCESS) {
-        return AVERROR_EXTERNAL;
-    } else {
-        vkWaitForFences(hwctx->act_dev, 1, &ectx->fence, VK_TRUE, UINT64_MAX);
-        vkResetFences(hwctx->act_dev, 1, &ectx->fence);
-    }
+    vkCmdPipelineBarrier(get_buf_exec_ctx(ctx, ectx),
+                         VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+                         VK_PIPELINE_STAGE_TRANSFER_BIT,
+                         0, 0, NULL, 0, NULL, planes, img_bar);
-    return 0;
+    return submit_exec_ctx(ctx, ectx, &s_info, 0);
 static int create_frame(AVHWFramesContext *hwfc, AVVkFrame **frame,
@@ -1559,7 +1707,8 @@ static int vulkan_frames_init(AVHWFramesContext *hwfc)
     hwctx->usage |= DEFAULT_USAGE_FLAGS;
     err = create_exec_ctx(hwfc->device_ctx, &fp->cmd,
-                          dev_hwctx->queue_family_tx_index);
+                          dev_hwctx->queue_family_tx_index,
+                          GET_QUEUE_COUNT(dev_hwctx, 0, 0, 1));
     if (err)
         return err;
@@ -2620,12 +2769,12 @@ static int unmap_buffers(AVHWDeviceContext *ctx, AVBufferRef **bufs,
     return err;
-static int transfer_image_buf(AVHWDeviceContext *ctx, AVVkFrame *frame,
+static int transfer_image_buf(AVHWDeviceContext *ctx, const AVFrame *f,
                               AVBufferRef **bufs, const int *buf_stride, int w,
                               int h, enum AVPixelFormat pix_fmt, int to_buf)
-    VkResult ret;
-    AVVulkanDeviceContext *hwctx = ctx->hwctx;
+    int err;
+    AVVkFrame *frame = (AVVkFrame *)f->data[0];
     VulkanDevicePriv *s = ctx->internal->priv;
     int bar_num = 0;
@@ -2634,17 +2783,11 @@ static int transfer_image_buf(AVHWDeviceContext *ctx, AVVkFrame *frame,
     const int planes = av_pix_fmt_count_planes(pix_fmt);
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
-    VkCommandBufferBeginInfo cmd_start = {
-    };
     VkImageMemoryBarrier img_bar[AV_NUM_DATA_POINTERS] = { 0 };
+    VkCommandBuffer cmd_buf = get_buf_exec_ctx(ctx, &s->cmd);
     VkSubmitInfo s_info = {
         .sType                = VK_STRUCTURE_TYPE_SUBMIT_INFO,
-        .commandBufferCount   = 1,
-        .pCommandBuffers      = &s->cmd.buf,
         .pSignalSemaphores    = frame->sem,
         .pWaitSemaphores      = frame->sem,
         .pWaitDstStageMask    = sem_wait_dst,
@@ -2652,12 +2795,8 @@ static int transfer_image_buf(AVHWDeviceContext *ctx, AVVkFrame *frame,
         .waitSemaphoreCount   = planes,
-    ret = vkBeginCommandBuffer(s->cmd.buf, &cmd_start);
-    if (ret != VK_SUCCESS) {
-        av_log(ctx, AV_LOG_ERROR, "Unable to init command buffer: %s\n",
-               vk_ret2str(ret));
-        return AVERROR_EXTERNAL;
-    }
+    if ((err = wait_start_exec_ctx(ctx, &s->cmd)))
+        return err;
     /* Change the image layout to something more optimal for transfers */
     for (int i = 0; i < planes; i++) {
@@ -2691,7 +2830,7 @@ static int transfer_image_buf(AVHWDeviceContext *ctx, AVVkFrame *frame,
     if (bar_num)
-        vkCmdPipelineBarrier(s->cmd.buf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+        vkCmdPipelineBarrier(cmd_buf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
                              VK_PIPELINE_STAGE_TRANSFER_BIT, 0,
                              0, NULL, 0, NULL, bar_num, img_bar);
@@ -2715,33 +2854,33 @@ static int transfer_image_buf(AVHWDeviceContext *ctx, AVVkFrame *frame,
         if (to_buf)
-            vkCmdCopyImageToBuffer(s->cmd.buf, frame->img[i], frame->layout[i],
+            vkCmdCopyImageToBuffer(cmd_buf, frame->img[i], frame->layout[i],
                                    vkbuf->buf, 1, &buf_reg);
-            vkCmdCopyBufferToImage(s->cmd.buf, vkbuf->buf, frame->img[i],
+            vkCmdCopyBufferToImage(cmd_buf, vkbuf->buf, frame->img[i],
                                    frame->layout[i], 1, &buf_reg);
-    ret = vkEndCommandBuffer(s->cmd.buf);
-    if (ret != VK_SUCCESS) {
-        av_log(ctx, AV_LOG_ERROR, "Unable to finish command buffer: %s\n",
-               vk_ret2str(ret));
-        return AVERROR_EXTERNAL;
-    }
-    /* Wait for the download/upload to finish if uploading, otherwise the
-     * semaphore will take care of synchronization when uploading */
-    ret = vkQueueSubmit(s->cmd.queue, 1, &s_info, s->cmd.fence);
-    if (ret != VK_SUCCESS) {
-        av_log(ctx, AV_LOG_ERROR, "Unable to submit command buffer: %s\n",
-               vk_ret2str(ret));
-        return AVERROR_EXTERNAL;
+    /* When uploading, do this asynchronously if the source is refcounted by
+     * keeping the buffers as a submission dependency.
+     * The hwcontext is guaranteed to not be freed until all frames are freed
+     * in the frames_uninit function.
+     * When downloading to buffer, do this synchronously and wait for the
+     * queue submission to finish executing */
+    if (!to_buf) {
+        int ref;
+        for (ref = 0; ref < AV_NUM_DATA_POINTERS; ref++) {
+            if (!f->buf[ref])
+                break;
+            if ((err = add_buf_dep_exec_ctx(hwfc, &s->cmd, &f->buf[ref], 1)))
+                return err;
+        }
+        if (ref && (err = add_buf_dep_exec_ctx(hwfc, &s->cmd, bufs, planes)))
+            return err;
+        return submit_exec_ctx(hwfc, &s->cmd, &s_info, !ref);
     } else {
-        vkWaitForFences(hwctx->act_dev, 1, &s->cmd.fence, VK_TRUE, UINT64_MAX);
-        vkResetFences(hwctx->act_dev, 1, &s->cmd.fence);
+        return submit_exec_ctx(hwfc, &s->cmd, &s_info,    1);
-    return 0;
 /* Technically we can use VK_EXT_external_memory_host to upload and download,
@@ -2778,11 +2917,11 @@ static int vulkan_transfer_data_from_mem(AVHWFramesContext *hwfc, AVFrame *dst,
         err = vulkan_map_frame_to_mem(hwfc, map, dst, AV_HWFRAME_MAP_WRITE);
         if (err)
-            goto end;
+            return err;
         err = av_frame_copy(map, src);
-        goto end;
+        return err;
     /* Create buffers */
@@ -2809,7 +2948,7 @@ static int vulkan_transfer_data_from_mem(AVHWFramesContext *hwfc, AVFrame *dst,
         goto end;
     /* Copy buffers to image */
-    err = transfer_image_buf(dev_ctx, f, bufs, tmp.linesize,
+    err = transfer_image_buf(dev_ctx, dst, bufs, tmp.linesize,
                              src->width, src->height, src->format, 0);
@@ -2949,10 +3088,12 @@ static int vulkan_transfer_data_to_mem(AVHWFramesContext *hwfc, AVFrame *dst,
         err = create_buf(dev_ctx, &bufs[i], p_height,
                          &tmp.linesize[i], VK_BUFFER_USAGE_TRANSFER_DST_BIT,
+        if (err)
+            goto end;
     /* Copy image to buffer */
-    if ((err = transfer_image_buf(dev_ctx, f, bufs, tmp.linesize,
+    if ((err = transfer_image_buf(dev_ctx, src, bufs, tmp.linesize,
                                   dst->width, dst->height, dst->format, 1)))
         goto end;

More information about the ffmpeg-cvslog mailing list