[FFmpeg-devel] [PATCH 2/2] ffv1enc_vulkan: add support for RCT coefficient searching

Lynne dev at lynne.ee
Wed Nov 20 09:27:02 EET 2024


---
 libavcodec/ffv1enc_vulkan.c                | 268 ++++++++++++++++++++-
 libavcodec/vulkan/Makefile                 |   4 +-
 libavcodec/vulkan/ffv1_enc_rct_search.comp | 139 +++++++++++
 libavcodec/vulkan/ffv1_enc_setup.comp      |  16 ++
 4 files changed, 422 insertions(+), 5 deletions(-)
 create mode 100644 libavcodec/vulkan/ffv1_enc_rct_search.comp

diff --git a/libavcodec/ffv1enc_vulkan.c b/libavcodec/ffv1enc_vulkan.c
index 3c1db9fd14..55b17f9784 100644
--- a/libavcodec/ffv1enc_vulkan.c
+++ b/libavcodec/ffv1enc_vulkan.c
@@ -36,6 +36,9 @@
 #define LG_ALIGN_W 32
 #define LG_ALIGN_H 32
 
+/* Level 4 and higher */
+#define RCT_MODEMAP_FMT AV_PIX_FMT_RGBA128
+
 typedef struct VulkanEncodeFFv1Context {
     FFV1Context ctx;
 
@@ -48,6 +51,7 @@ typedef struct VulkanEncodeFFv1Context {
 
     FFVulkanShader setup;
     FFVulkanShader reset;
+    FFVulkanShader rct_search;
     FFVulkanShader rct;
     FFVulkanShader enc;
 
@@ -73,6 +77,9 @@ typedef struct VulkanEncodeFFv1Context {
     /* Intermediate frame pool */
     AVBufferRef *intermediate_frames_ref;
 
+    /* Frame pool for RCT mode images */
+    AVBufferRef *rct_mode_frames_ref;
+
     /* Representation mode */
     enum FFVkShaderRepFormat rep_fmt;
 
@@ -92,6 +99,7 @@ extern const char *ff_source_ffv1_common_comp;
 extern const char *ff_source_ffv1_reset_comp;
 extern const char *ff_source_ffv1_enc_common_comp;
 extern const char *ff_source_ffv1_enc_rct_comp;
+extern const char *ff_source_ffv1_enc_rct_search_comp;
 extern const char *ff_source_ffv1_enc_vlc_comp;
 extern const char *ff_source_ffv1_enc_ac_comp;
 extern const char *ff_source_ffv1_enc_setup_comp;
@@ -122,6 +130,7 @@ typedef struct FFv1VkParameters {
 
     int32_t sar[2];
     uint32_t chroma_shift[2];
+    int32_t modemap_basis[2];
 
     uint32_t plane_state_size;
     uint32_t context_count;
@@ -154,6 +163,7 @@ static void add_push_data(FFVulkanShader *shd)
     GLSLC(0,                                                                  );
     GLSLC(1,    ivec2 sar;                                                    );
     GLSLC(1,    uvec2 chroma_shift;                                           );
+    GLSLC(1,    ivec2 modemap_basis;                                          );
     GLSLC(0,                                                                  );
     GLSLC(1,    uint plane_state_size;                                        );
     GLSLC(1,    uint context_count;                                           );
@@ -179,6 +189,83 @@ static void add_push_data(FFVulkanShader *shd)
                                 VK_SHADER_STAGE_COMPUTE_BIT);
 }
 
+static int run_rct_search(AVCodecContext *avctx, FFVkExecContext *exec,
+                          AVFrame *enc_in, VkImageView *enc_in_views,
+                          AVFrame **rct_modemap_frame, VkImageView *rct_modemap_views,
+                          VkImageMemoryBarrier2 *img_bar, int *nb_img_bar)
+{
+    int err;
+    VulkanEncodeFFv1Context *fv = avctx->priv_data;
+    FFV1Context *f = &fv->ctx;
+    FFVulkanFunctions *vk = &fv->s.vkfn;
+    AVHWFramesContext *src_hwfc = (AVHWFramesContext *)enc_in->hw_frames_ctx->data;
+    FFv1VkRCTParameters pd;
+    /* Records the RCT coefficient search for enc_in into exec's command buffer */
+    /* Create a temporary frame for the mode map, returned via rct_modemap_frame */
+    *rct_modemap_frame = av_frame_alloc();
+    if (!(*rct_modemap_frame))
+        return AVERROR(ENOMEM);
+    /* Take an image from the mode map pool, add it to exec and create its views */
+    RET(av_hwframe_get_buffer(fv->rct_mode_frames_ref,
+                              *rct_modemap_frame, 0));
+    RET(ff_vk_exec_add_dep_frame(&fv->s, exec, *rct_modemap_frame,
+                                 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+                                 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
+    RET(ff_vk_create_imageviews(&fv->s, exec, rct_modemap_views,
+                                *rct_modemap_frame,
+                                FF_VK_REP_UINT));
+    ff_vk_frame_barrier(&fv->s, exec, *rct_modemap_frame, img_bar, nb_img_bar,
+                        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+                        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+                        VK_ACCESS_SHADER_WRITE_BIT,
+                        VK_IMAGE_LAYOUT_GENERAL,
+                        VK_QUEUE_FAMILY_IGNORED);
+
+    /* Update descriptors: input frame and mode map, passed as (1, 0) and (1, 1) */
+    ff_vk_shader_update_img_array(&fv->s, exec, &fv->rct_search,
+                                  enc_in, enc_in_views,
+                                  1, 0,
+                                  VK_IMAGE_LAYOUT_GENERAL,
+                                  VK_NULL_HANDLE);
+    ff_vk_shader_update_img_array(&fv->s, exec, &fv->rct_search,
+                                  *rct_modemap_frame, rct_modemap_views,
+                                  1, 1,
+                                  VK_IMAGE_LAYOUT_GENERAL,
+                                  VK_NULL_HANDLE);
+    /* Submit the first *nb_img_bar image barriers held in img_bar */
+    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+            .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+            .pImageMemoryBarriers = img_bar,
+            .imageMemoryBarrierCount = *nb_img_bar,
+    });
+    /* NOTE(review): *nb_img_bar is not reset after this submit - confirm intended */
+    /* Bind the search shader and upload its push constants */
+    ff_vk_exec_bind_shader(&fv->s, exec, &fv->rct_search);
+    pd = (FFv1VkRCTParameters) {
+        .offset = 1 << f->bits_per_raw_sample,
+        .planar_rgb = ff_vk_mt_is_np_rgb(src_hwfc->sw_format) &&
+                      (ff_vk_count_images((AVVkFrame *)enc_in->data[0]) > 1),
+        .transparency = f->transparency,
+    };
+    ff_vk_shader_update_push_const(&fv->s, exec, &fv->rct_search,
+                                   VK_SHADER_STAGE_COMPUTE_BIT,
+                                   0, sizeof(pd), &pd);
+    /* Workgroup counts are the mode map frame's width and height */
+    vk->CmdDispatch(exec->buf,
+                    (*rct_modemap_frame)->width,
+                    (*rct_modemap_frame)->height, 1);
+    /* Record a shader-read barrier for the mode map; it is not submitted here */
+    ff_vk_frame_barrier(&fv->s, exec, *rct_modemap_frame, img_bar, nb_img_bar,
+                        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+                        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+                        VK_ACCESS_SHADER_READ_BIT,
+                        VK_IMAGE_LAYOUT_GENERAL,
+                        VK_QUEUE_FAMILY_IGNORED);
+    /* On error, *rct_modemap_frame is not freed in this function */
+fail:
+    return err;
+}
+
 static int run_rct(AVCodecContext *avctx, FFVkExecContext *exec,
                    AVFrame *enc_in, VkImageView *enc_in_views,
                    AVFrame **intermediate_frame, VkImageView *intermediate_views,
@@ -285,6 +372,7 @@ static int vulkan_encode_ffv1_frame(AVCodecContext *avctx, AVPacket *pkt,
     FFv1VkParameters pd;
 
     AVFrame *intermediate_frame = NULL;
+    AVFrame *rct_modemap_frame = NULL;
 
     /* Temporary data */
     size_t tmp_data_size;
@@ -317,6 +405,7 @@ static int vulkan_encode_ffv1_frame(AVCodecContext *avctx, AVPacket *pkt,
 
     VkImageView in_views[AV_NUM_DATA_POINTERS];
     VkImageView intermediate_views[AV_NUM_DATA_POINTERS];
+    VkImageView rct_modemap_views[AV_NUM_DATA_POINTERS];
 
     AVFrame *enc_in = (AVFrame *)pict;
     VkImageView *enc_in_views = in_views;
@@ -475,6 +564,19 @@ static int vulkan_encode_ffv1_frame(AVCodecContext *avctx, AVPacket *pkt,
         };
     }
 
+    if (fv->is_rgb && f->version >= 4) {
+        RET(run_rct_search(avctx, exec,
+                           enc_in, enc_in_views,
+                           &rct_modemap_frame, rct_modemap_views,
+                           img_bar, &nb_img_bar));
+
+        ff_vk_shader_update_img_array(&fv->s, exec, &fv->setup,
+                                      rct_modemap_frame, rct_modemap_views,
+                                      1, 2,
+                                      VK_IMAGE_LAYOUT_GENERAL,
+                                      VK_NULL_HANDLE);
+    }
+
     vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
         .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
         .pImageMemoryBarriers = img_bar,
@@ -501,6 +603,8 @@ static int vulkan_encode_ffv1_frame(AVCodecContext *avctx, AVPacket *pkt,
         .sar[1] = pict->sample_aspect_ratio.den,
         .chroma_shift[0] = f->chroma_h_shift,
         .chroma_shift[1] = f->chroma_v_shift,
+        .modemap_basis[0] = fv->rct_search.lg_size[0],
+        .modemap_basis[1] = fv->rct_search.lg_size[1],
         .plane_state_size = plane_state_size,
         .context_count = context_count,
         .crcref = f->crcref,
@@ -652,6 +756,7 @@ static int vulkan_encode_ffv1_frame(AVCodecContext *avctx, AVPacket *pkt,
     /* We need the encoded data immediately */
     ff_vk_exec_wait(&fv->s, exec);
     av_frame_free(&intermediate_frame);
+    av_frame_free(&rct_modemap_frame);
 
     /* Invalidate slice/output data if needed */
     if (!(results_data_buf->flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) {
@@ -741,6 +846,7 @@ fail:
     /* Frames added as a dep are always referenced, so we only need to
      * clean this up. */
     av_frame_free(&intermediate_frame);
+    av_frame_free(&rct_modemap_frame);
 
     return 0;
 }
@@ -752,6 +858,10 @@ static int init_indirect(AVCodecContext *avctx, enum AVPixelFormat sw_format)
     AVHWFramesContext *frames_ctx;
     AVVulkanFramesContext *vk_frames;
 
+    int subgroup_size = fv->s.props_11.subgroupSize;
+    int lg_rows = fv->s.props.properties.limits.maxComputeWorkGroupInvocations /
+                  subgroup_size;
+
     fv->intermediate_frames_ref = av_hwframe_ctx_alloc(fv->s.device_ref);
     if (!fv->intermediate_frames_ref)
         return AVERROR(ENOMEM);
@@ -759,8 +869,8 @@ static int init_indirect(AVCodecContext *avctx, enum AVPixelFormat sw_format)
     frames_ctx = (AVHWFramesContext *)fv->intermediate_frames_ref->data;
     frames_ctx->format    = AV_PIX_FMT_VULKAN;
     frames_ctx->sw_format = sw_format;
-    frames_ctx->width     = FFALIGN(fv->s.frames->width, 32);
-    frames_ctx->height    = FFALIGN(fv->s.frames->height, 32);
+    frames_ctx->width     = FFALIGN(fv->s.frames->width, FFMAX(subgroup_size, 32));
+    frames_ctx->height    = FFALIGN(fv->s.frames->height, FFMAX(lg_rows, 32));
 
     vk_frames = frames_ctx->hwctx;
     vk_frames->tiling    = VK_IMAGE_TILING_OPTIMAL;
@@ -826,6 +936,39 @@ end:
     return fmt;
 }
 
+static int init_modemap(AVCodecContext *avctx, enum AVPixelFormat sw_format,
+                        int lg_size0, int lg_size1)
+{
+    int err;
+    VulkanEncodeFFv1Context *fv = avctx->priv_data;
+    AVHWFramesContext *frames_ctx;
+    AVVulkanFramesContext *vk_frames;
+    /* Sets up fv->rct_mode_frames_ref, the frame pool for the RCT mode map images */
+    fv->rct_mode_frames_ref = av_hwframe_ctx_alloc(fv->s.device_ref);
+    if (!fv->rct_mode_frames_ref)
+        return AVERROR(ENOMEM);
+    /* One texel per lg_size0 x lg_size1 block of the input, rounded up, plus 20 */
+    frames_ctx = (AVHWFramesContext *)fv->rct_mode_frames_ref->data;
+    frames_ctx->format    = AV_PIX_FMT_VULKAN;
+    frames_ctx->sw_format = sw_format;
+    frames_ctx->width     = (FFALIGN(fv->s.frames->width, lg_size0)/lg_size0) + 20; /* NOTE(review): purpose of the + 20 is not evident here - confirm */
+    frames_ctx->height    = (FFALIGN(fv->s.frames->height, lg_size1)/lg_size1) + 20;
+    /* Optimally tiled images that are only used as storage images */
+    vk_frames = frames_ctx->hwctx;
+    vk_frames->tiling    = VK_IMAGE_TILING_OPTIMAL;
+    vk_frames->usage     = VK_IMAGE_USAGE_STORAGE_BIT;
+
+    err = av_hwframe_ctx_init(fv->rct_mode_frames_ref);
+    if (err < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Unable to initialize modemap pool with format %s: %s\n",
+               av_get_pix_fmt_name(sw_format), av_err2str(err));
+        av_buffer_unref(&fv->rct_mode_frames_ref); /* leaves the reference NULL on failure */
+        return err;
+    }
+
+    return 0;
+}
+
 static void define_shared_code(AVCodecContext *avctx, FFVulkanShader *shd)
 {
     VulkanEncodeFFv1Context *fv = avctx->priv_data;
@@ -912,8 +1055,18 @@ static int init_setup_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv)
             .mem_quali  = "readonly",
             .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
         },
+        {
+            .name       = "modemap",
+            .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+            .dimensions = 2,
+            .mem_layout = ff_vk_shader_rep_fmt(RCT_MODEMAP_FMT,
+                                               FF_VK_REP_UINT),
+            .elems      = av_pix_fmt_count_planes(RCT_MODEMAP_FMT),
+            .mem_quali  = "readonly",
+            .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
+        },
     };
-    RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 2, 0, 0));
+    RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 3, 0, 0));
 
     add_push_data(shd);
 
@@ -1013,6 +1166,105 @@ fail:
     return err;
 }
 
+static int init_rct_search_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv)
+{
+    int err;
+    VulkanEncodeFFv1Context *fv = avctx->priv_data;
+    FFVulkanShader *shd = &fv->rct_search;
+    FFVulkanDescriptorSetBinding *desc_set;
+    int subgroup_size = fv->s.props_11.subgroupSize;
+    int lg_rows = fv->s.props.properties.limits.maxComputeWorkGroupInvocations /
+                  subgroup_size;
+    /* lg_rows: how many subgroup_size-wide rows fit in the invocation limit */
+    uint8_t *spv_data;
+    size_t spv_len;
+    void *spv_opaque = NULL;
+    /* Mode map pool: one texel per subgroup_size x lg_rows block of the frame */
+    RET(init_modemap(avctx, RCT_MODEMAP_FMT, subgroup_size, lg_rows));
+    /* Builds fv->rct_search as a compute shader, sized subgroup_size x lg_rows x 1 */
+    RET(ff_vk_shader_init(&fv->s, shd, "ffv1_rct_search",
+                          VK_SHADER_STAGE_COMPUTE_BIT,
+                          (const char *[]) { "GL_EXT_null_initializer",
+                                             "GL_KHR_shader_subgroup_basic",
+                                             "GL_KHR_shader_subgroup_arithmetic",
+                                             "GL_EXT_buffer_reference",
+                                             "GL_EXT_buffer_reference2" }, 5,
+                          subgroup_size, lg_rows, 1,
+                          0));
+    /* Constants used by the quant_table declaration below */
+    av_bprintf(&shd->src, "#define MAX_QUANT_TABLES %i\n", MAX_QUANT_TABLES);
+    av_bprintf(&shd->src, "#define MAX_CONTEXT_INPUTS %i\n", MAX_CONTEXT_INPUTS);
+    av_bprintf(&shd->src, "#define MAX_QUANT_TABLE_SIZE %i\n", MAX_QUANT_TABLE_SIZE);
+    /* Descriptor set with the range coder state table and the quantization tables */
+    desc_set = (FFVulkanDescriptorSetBinding []) {
+        {
+            .name        = "rangecoder_static_buf",
+            .type        = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "uint8_t zero_one_state[512];",
+        },
+        {
+            .name        = "quant_buf",
+            .type        = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "int16_t quant_table[MAX_QUANT_TABLES]"
+                           "[MAX_CONTEXT_INPUTS][MAX_QUANT_TABLE_SIZE];",
+        },
+    };
+    RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 2, 1, 0));
+
+    define_shared_code(avctx, shd);
+    /* Descriptor set with the source image planes (read) and the mode map (write) */
+    desc_set = (FFVulkanDescriptorSetBinding []) {
+        {
+            .name       = "src",
+            .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+            .dimensions = 2,
+            .mem_layout = ff_vk_shader_rep_fmt(fv->s.frames->sw_format,
+                                               fv->rep_fmt),
+            .elems      = av_pix_fmt_count_planes(fv->s.frames->sw_format),
+            .mem_quali  = "readonly",
+            .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
+        },
+        {
+            .name       = "modemap",
+            .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+            .dimensions = 2,
+            .mem_layout = ff_vk_shader_rep_fmt(RCT_MODEMAP_FMT,
+                                               FF_VK_REP_UINT),
+            .elems      = av_pix_fmt_count_planes(RCT_MODEMAP_FMT),
+            .mem_quali  = "writeonly",
+            .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
+        },
+    };
+    RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 2, 0, 0));
+    /* Push constant block; registered below with sizeof(FFv1VkRCTParameters) */
+    GLSLC(0, layout(push_constant, scalar) uniform pushConstants {             );
+    GLSLC(1,    int offset;                                                    );
+    GLSLC(1,    uint8_t planar_rgb;                                            );
+    GLSLC(1,    uint8_t transparency;                                          );
+    GLSLC(1,    uint8_t padding[2];                                            );
+    GLSLC(0, };                                                                );
+    ff_vk_shader_add_push_const(shd, 0, sizeof(FFv1VkRCTParameters),
+                                VK_SHADER_STAGE_COMPUTE_BIT);
+    /* Append the search shader source itself */
+    GLSLD(ff_source_ffv1_enc_rct_search_comp);
+    /* Compile to SPIR-V, link, and register the shader with the exec pool */
+    RET(spv->compile_shader(&fv->s, spv, shd, &spv_data, &spv_len, "main",
+                            &spv_opaque));
+    RET(ff_vk_shader_link(&fv->s, shd, spv_data, spv_len, "main"));
+
+    RET(ff_vk_shader_register_exec(&fv->s, &fv->exec_pool, shd));
+    /* Reached on success as well: the compiler's opaque data is always released */
+fail:
+    if (spv_opaque)
+        spv->free_shader(spv, &spv_opaque);
+
+    return err;
+}
+
 static int init_rct_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv)
 {
     int err;
@@ -1506,6 +1758,14 @@ static av_cold int vulkan_encode_ffv1_init(AVCodecContext *avctx)
 
     /* Init RCT shader */
     if (fv->is_rgb) {
+        if (f->version >= 4) {
+            err = init_rct_search_shader(avctx, spv);
+            if (err < 0) {
+                spv->uninit(&spv);
+                return err;
+            }
+        }
+
         err = init_rct_shader(avctx, spv);
         if (err < 0) {
             spv->uninit(&spv);
@@ -1548,9 +1808,11 @@ static av_cold int vulkan_encode_ffv1_close(AVCodecContext *avctx)
 
     ff_vk_shader_free(&fv->s, &fv->enc);
     ff_vk_shader_free(&fv->s, &fv->rct);
+    ff_vk_shader_free(&fv->s, &fv->rct_search);
     ff_vk_shader_free(&fv->s, &fv->reset);
     ff_vk_shader_free(&fv->s, &fv->setup);
 
+    av_buffer_unref(&fv->rct_mode_frames_ref);
     av_buffer_unref(&fv->intermediate_frames_ref);
 
     av_buffer_pool_uninit(&fv->results_data_pool);
diff --git a/libavcodec/vulkan/Makefile b/libavcodec/vulkan/Makefile
index 351332ee44..6b6eedda4d 100644
--- a/libavcodec/vulkan/Makefile
+++ b/libavcodec/vulkan/Makefile
@@ -6,8 +6,8 @@ clean::
 OBJS-$(CONFIG_FFV1_VULKAN_ENCODER)  +=  vulkan/common.o \
 					vulkan/rangecoder.o vulkan/ffv1_vlc.o \
 					vulkan/ffv1_common.o vulkan/ffv1_reset.o \
-					vulkan/ffv1_enc_common.o \
-					vulkan/ffv1_enc_rct.o vulkan/ffv1_enc_setup.o \
+					vulkan/ffv1_enc_common.o vulkan/ffv1_enc_setup.o \
+					vulkan/ffv1_enc_rct.o vulkan/ffv1_enc_rct_search.o \
 					vulkan/ffv1_enc_vlc.o vulkan/ffv1_enc_ac.o \
 					vulkan/ffv1_enc.o vulkan/ffv1_enc_rgb.o
 
diff --git a/libavcodec/vulkan/ffv1_enc_rct_search.comp b/libavcodec/vulkan/ffv1_enc_rct_search.comp
new file mode 100644
index 0000000000..ad251b8588
--- /dev/null
+++ b/libavcodec/vulkan/ffv1_enc_rct_search.comp
@@ -0,0 +1,139 @@
+/*
+ * FFv1 codec
+ *
+ * Copyright (c) 2024 Lynne <dev at lynne.ee>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+ivec4 load_components(void)
+{
+    const ivec2 pos = ivec2(gl_GlobalInvocationID);
+    /* Loads this invocation's pixel. Packed input: src[0] holds all components */
+    if (planar_rgb == 0)
+        return ivec4(imageLoad(src[0], pos));
+    /* Planar: one image per component; zero-fill so a missing alpha is defined */
+    ivec4 pix = ivec4(0);
+    for (int i = 0; i < (3 + transparency); i++)
+        pix[i] = int(imageLoad(src[i], pos)[0]);
+
+    /* Swizzle out the difference */
+    if (transparency > 0)
+        return pix.brga;
+    return pix.bgra;
+}
+
+#define NUM_CHECKS 15 /* number of entries in rct_y_coeff */
+const ivec2 rct_y_coeff[] = { /* (x, y): weights in quarters of the R and B differences, see transform_sample() */
+    ivec2(0, 0), //      4G
+
+    ivec2(0, 1), //      3G +  B
+    ivec2(1, 0), //  R + 3G
+    ivec2(1, 1), //  R + 2G + B
+
+    ivec2(0, 2), //      2G + 2B
+    ivec2(2, 0), // 2R + 2G
+    ivec2(2, 2), // 2R      + 2B
+
+    ivec2(0, 3), //      1G + 3B
+    ivec2(3, 0), // 3R + 1G
+
+    ivec2(0, 4), //           4B
+    ivec2(4, 0), // 4R
+
+    ivec2(1, 2), //  R +  G + 2B
+    ivec2(2, 1), // 2R +  G +  B
+
+    ivec2(3, 1), // 3R      +  B
+    ivec2(1, 3), //  R      + 3B
+};
+
+shared ivec4 pix_buf[gl_WorkGroupSize.x + 1][gl_WorkGroupSize.y + 1] = { }; /* transformed samples, stored at [x + 1][y + 1]; index 0 in either dimension is never written */
+
+ivec4 transform_sample(ivec4 pix, ivec2 rct_coef)
+{
+    /* R and B become offset differences to G; G gains a quarter of the
+     * rct_coef-weighted sum of both differences. Alpha passes through. */
+    const int dr = pix.r - pix.g;
+    const int db = pix.b - pix.g;
+    const int y  = pix.g + ((dr*rct_coef.x + db*rct_coef.y) >> 2);
+    return ivec4(dr + offset, y, db + offset, pix.a);
+}
+
+uint get_dist(ivec4 cur)
+{
+    /* The current sample sits at [x + 1][y + 1] in pix_buf, so lower indices
+     * address its left, top-left and top neighbours */
+    const uvec2 lp = gl_LocalInvocationID.xy;
+    const ivec4 left     = pix_buf[lp.x + 0][lp.y + 1];
+    const ivec4 top_left = pix_buf[lp.x + 0][lp.y + 0];
+    const ivec4 top      = pix_buf[lp.x + 1][lp.y + 0];
+    ivec4 pred;
+    for (int i = 0; i < 4; i++)
+        pred[i] = predict(left[i], ivec2(top_left[i], top[i]));
+    const uvec4 diff = abs(cur - pred); /* per-component absolute error */
+    return diff.x + diff.y + diff.z + diff.w;
+}
+
+shared uint score_cols[gl_WorkGroupSize.y] = { };
+/* Rates every rct_y_coeff entry over the workgroup's block, stores the best */
+void coeff_rating(void)
+{
+    ivec4 pix = load_components();
+    uint min_sum = 0xFFFFFFFF;
+    int best_mode = 1;
+
+    for (int i = 0; i < NUM_CHECKS; i++) {
+        ivec4 tx_pix = transform_sample(pix, rct_y_coeff[i]);
+        pix_buf[gl_LocalInvocationID.x + 1][gl_LocalInvocationID.y + 1] = tx_pix;
+        /* Neighbours are stored by other invocations: wait for all of them */
+        memoryBarrierShared();
+        barrier();
+        uint dist = get_dist(tx_pix);
+
+        /* Sum over the subgroup, i.e. one row if rows map to subgroups */
+        uint col_sum = subgroupAdd(dist);
+        /* The first invocation of every row publishes that row's sum */
+        if (gl_LocalInvocationID.x == 0)
+            score_cols[gl_LocalInvocationID.y] = col_sum;
+
+        memoryBarrierShared();
+        barrier();
+        /* Sum row-wise */
+        uint row_sum = 0;
+        for (uint j = gl_LocalInvocationID.x; j < gl_WorkGroupSize.y; j += gl_WorkGroupSize.x)
+            row_sum += score_cols[j];
+
+        uint block_sum = subgroupAdd(row_sum);
+        if (block_sum < min_sum) {
+            min_sum = block_sum;
+            best_mode = i;
+        }
+    }
+    /* One texel per workgroup: best coefficient pair and its score */
+    if (gl_LocalInvocationID.x == 0 && gl_LocalInvocationID.y == 0)
+        imageStore(modemap[0], ivec2(gl_WorkGroupID),
+                   uvec4(rct_y_coeff[best_mode].x,
+                         rct_y_coeff[best_mode].y,
+                         min_sum,
+                         0));
+}
+
+void main(void)
+{
+    coeff_rating(); /* the entry point does nothing else */
+}
diff --git a/libavcodec/vulkan/ffv1_enc_setup.comp b/libavcodec/vulkan/ffv1_enc_setup.comp
index b861e25f74..d9bc2b453f 100644
--- a/libavcodec/vulkan/ffv1_enc_setup.comp
+++ b/libavcodec/vulkan/ffv1_enc_setup.comp
@@ -53,6 +53,22 @@ void init_slice(out SliceContext sc, const uint slice_idx)
     sc.slice_rct_coef = ivec2(1, 1);
     sc.slice_coding_mode = int(force_pcm == 1);
 
+    if (version >= 4) {
+        ivec2 modemap_pos = sc.slice_pos / modemap_basis;
+        ivec2 modemap_end = (sc.slice_pos + sc.slice_dim) / modemap_basis;
+
+        /* Pick the lowest-scoring mode map block within this slice; x restarts on every row */
+        uvec4 res = uvec4(1, 1, 0xFFFFFFFF, 0);
+        for (int y = modemap_pos.y; y < modemap_end.y; y++) {
+            for (int x = modemap_pos.x; x < modemap_end.x; x++) {
+                uvec4 tmp = imageLoad(modemap[0], ivec2(x, y));
+                if (tmp.z < res.z)
+                    res = tmp;
+            }
+        }
+        sc.slice_rct_coef = ivec2(res.xy);
+    }
+
     rac_init(sc.c,
              OFFBUF(u8buf, out_data, slice_idx * slice_size_max),
              slice_size_max);
-- 
2.45.2.753.g447d99e1c3b


More information about the ffmpeg-devel mailing list