[FFmpeg-devel] [PATCH] avcodec/vc2enc: Initial vulkan VC2 encoder

Lynne dev at lynne.ee
Sat Nov 30 03:35:16 EET 2024


On 30/11/2024 05:09, IndecisiveTurtle wrote:

> Implements a Vulkan-based Dirac encoder. Supports Haar and LeGall wavelets and should work with all wavelet depths.
>
> Performance-wise, encoding a 1-minute 1080p video takes about 2.5 minutes with the CPU encoder running on my Ryzen 5 4600H, while it takes about 30 seconds on my NVIDIA GTX 1650.
>
> The Haar shader has a subgroup-optimized variant that is used when the configured wavelet depth allows it.
>
> lavapipe seems to be bugged for some reason; after a bunch of debugging I'm not quite sure whether it's a bug here or in lavapipe. But people probably don't want to use this with a software implementation anyway.
> ---
>   configure                                    |   1 +
>   libavcodec/Makefile                          |   5 +-
>   libavcodec/allcodecs.c                       |   1 +
>   libavcodec/vc2enc.c                          | 501 +-----------
>   libavcodec/vc2enc_common.c                   | 368 +++++++++
>   libavcodec/vc2enc_common.h                   | 279 +++++++
>   libavcodec/vc2enc_vulkan.c                   | 786 +++++++++++++++++++
>   libavcodec/vulkan/common.comp                |   7 +-
>   libavcodec/vulkan/vc2_dwt_haar.comp          |  76 ++
>   libavcodec/vulkan/vc2_dwt_haar_subgroup.comp |  94 +++
>   libavcodec/vulkan/vc2_dwt_hor_legall.comp    |  61 ++
>   libavcodec/vulkan/vc2_dwt_legall.comp        |  74 ++
>   libavcodec/vulkan/vc2_dwt_upload.comp        |  45 ++
>   libavcodec/vulkan/vc2_dwt_ver_legall.comp    |  55 ++
>   libavcodec/vulkan/vc2_encode.comp            | 204 +++++
>   libavcodec/vulkan/vc2_slice_sizes.comp       | 184 +++++
>   16 files changed, 2238 insertions(+), 503 deletions(-)
>   create mode 100644 libavcodec/vc2enc_common.c
>   create mode 100644 libavcodec/vc2enc_common.h
>   create mode 100644 libavcodec/vc2enc_vulkan.c
>   create mode 100644 libavcodec/vulkan/vc2_dwt_haar.comp
>   create mode 100644 libavcodec/vulkan/vc2_dwt_haar_subgroup.comp
>   create mode 100644 libavcodec/vulkan/vc2_dwt_hor_legall.comp
>   create mode 100644 libavcodec/vulkan/vc2_dwt_legall.comp
>   create mode 100644 libavcodec/vulkan/vc2_dwt_upload.comp
>   create mode 100644 libavcodec/vulkan/vc2_dwt_ver_legall.comp
>   create mode 100644 libavcodec/vulkan/vc2_encode.comp
>   create mode 100644 libavcodec/vulkan/vc2_slice_sizes.comp
>
> diff --git a/configure b/configure
> index d7b7b49f92..f2eb31b45e 100755
> --- a/configure
> +++ b/configure
> @@ -3109,6 +3109,7 @@ utvideo_encoder_select="bswapdsp huffman llvidencdsp"
>   vble_decoder_select="llviddsp"
>   vbn_decoder_select="texturedsp"
>   vbn_encoder_select="texturedspenc"
> +vc2_vulkan_encoder_select="vulkan spirv_compiler"
>   vmix_decoder_select="idctdsp"
>   vc1_decoder_select="blockdsp h264qpel intrax8 mpegvideodec qpeldsp vc1dsp"
>   vc1image_decoder_select="vc1_decoder"
> diff --git a/libavcodec/Makefile b/libavcodec/Makefile
> index a6e0e0b55e..d6621d4d29 100644
> --- a/libavcodec/Makefile
> +++ b/libavcodec/Makefile
> @@ -767,7 +767,10 @@ OBJS-$(CONFIG_VC1_CUVID_DECODER)       += cuviddec.o
>   OBJS-$(CONFIG_VC1_MMAL_DECODER)        += mmaldec.o
>   OBJS-$(CONFIG_VC1_QSV_DECODER)         += qsvdec.o
>   OBJS-$(CONFIG_VC1_V4L2M2M_DECODER)     += v4l2_m2m_dec.o
> -OBJS-$(CONFIG_VC2_ENCODER)             += vc2enc.o vc2enc_dwt.o diractab.o
> +OBJS-$(CONFIG_VC2_ENCODER)             += vc2enc.o vc2enc_dwt.o vc2enc_common.o diractab.o
> +OBJS-$(CONFIG_VC2_VULKAN_ENCODER)      += vc2enc_vulkan.o vulkan/vc2_encode.o vulkan/vc2_slice_sizes.o \
> +                                          vulkan/vc2_dwt_hor_legall.o vulkan/vc2_dwt_ver_legall.o \
> +                                          vulkan/vc2_dwt_upload.o vulkan/vc2_dwt_haar.o vulkan/vc2_dwt_haar_subgroup.o


Trailing space on the last line.


Use libavcodec/vulkan/Makefile for all the shader sources.

Otherwise the generated .c files won't be cleaned up properly.


>   OBJS-$(CONFIG_VCR1_DECODER)            += vcr1.o
>   OBJS-$(CONFIG_VMDAUDIO_DECODER)        += vmdaudio.o
>   OBJS-$(CONFIG_VMDVIDEO_DECODER)        += vmdvideo.o
> diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
> index 0b559dfc58..a313f47cd7 100644
> --- a/libavcodec/allcodecs.c
> +++ b/libavcodec/allcodecs.c
> @@ -364,6 +364,7 @@ extern const FFCodec ff_vc1image_decoder;
>   extern const FFCodec ff_vc1_mmal_decoder;
>   extern const FFCodec ff_vc1_qsv_decoder;
>   extern const FFCodec ff_vc1_v4l2m2m_decoder;
> +extern const FFCodec ff_vc2_vulkan_encoder;
>   extern const FFCodec ff_vc2_encoder;
>   extern const FFCodec ff_vcr1_decoder;
>   extern const FFCodec ff_vmdvideo_decoder;
> diff --git a/libavcodec/vc2enc.c b/libavcodec/vc2enc.c
> index b82370a753..54fcbe5f83 100644
> --- a/libavcodec/vc2enc.c
> +++ b/libavcodec/vc2enc.c
> @@ -29,506 +29,7 @@
>   #include "put_bits.h"
>   #include "version.h"
>   
> -#include "vc2enc_dwt.h"
> -#include "diractab.h"
> -
> -/* The limited size resolution of each slice forces us to do this */
> -#define SSIZE_ROUND(b) (FFALIGN((b), s->size_scaler) + 4 + s->prefix_bytes)
> -
> -/* Decides the cutoff point in # of slices to distribute the leftover bytes */
> -#define SLICE_REDIST_TOTAL 150
> -
> -typedef struct VC2BaseVideoFormat {
> -    enum AVPixelFormat pix_fmt;
> -    AVRational time_base;
> -    int width, height;
> -    uint8_t interlaced, level;
> -    char name[13];
> -} VC2BaseVideoFormat;
> -
> -static const VC2BaseVideoFormat base_video_fmts[] = {
> -    { 0 }, /* Custom format, here just to make indexing equal to base_vf */
> -    { AV_PIX_FMT_YUV420P,   { 1001, 15000 },  176,  120, 0, 1,     "QSIF525" },
> -    { AV_PIX_FMT_YUV420P,   {    2,    25 },  176,  144, 0, 1,     "QCIF"    },
> -    { AV_PIX_FMT_YUV420P,   { 1001, 15000 },  352,  240, 0, 1,     "SIF525"  },
> -    { AV_PIX_FMT_YUV420P,   {    2,    25 },  352,  288, 0, 1,     "CIF"     },
> -    { AV_PIX_FMT_YUV420P,   { 1001, 15000 },  704,  480, 0, 1,     "4SIF525" },
> -    { AV_PIX_FMT_YUV420P,   {    2,    25 },  704,  576, 0, 1,     "4CIF"    },
> -
> -    { AV_PIX_FMT_YUV422P10, { 1001, 30000 },  720,  480, 1, 2,   "SD480I-60" },
> -    { AV_PIX_FMT_YUV422P10, {    1,    25 },  720,  576, 1, 2,   "SD576I-50" },
> -
> -    { AV_PIX_FMT_YUV422P10, { 1001, 60000 }, 1280,  720, 0, 3,  "HD720P-60"  },
> -    { AV_PIX_FMT_YUV422P10, {    1,    50 }, 1280,  720, 0, 3,  "HD720P-50"  },
> -    { AV_PIX_FMT_YUV422P10, { 1001, 30000 }, 1920, 1080, 1, 3,  "HD1080I-60" },
> -    { AV_PIX_FMT_YUV422P10, {    1,    25 }, 1920, 1080, 1, 3,  "HD1080I-50" },
> -    { AV_PIX_FMT_YUV422P10, { 1001, 60000 }, 1920, 1080, 0, 3,  "HD1080P-60" },
> -    { AV_PIX_FMT_YUV422P10, {    1,    50 }, 1920, 1080, 0, 3,  "HD1080P-50" },
> -
> -    { AV_PIX_FMT_YUV444P12, {    1,    24 }, 2048, 1080, 0, 4,        "DC2K" },
> -    { AV_PIX_FMT_YUV444P12, {    1,    24 }, 4096, 2160, 0, 5,        "DC4K" },
> -
> -    { AV_PIX_FMT_YUV422P10, { 1001, 60000 }, 3840, 2160, 0, 6, "UHDTV 4K-60" },
> -    { AV_PIX_FMT_YUV422P10, {    1,    50 }, 3840, 2160, 0, 6, "UHDTV 4K-50" },
> -
> -    { AV_PIX_FMT_YUV422P10, { 1001, 60000 }, 7680, 4320, 0, 7, "UHDTV 8K-60" },
> -    { AV_PIX_FMT_YUV422P10, {    1,    50 }, 7680, 4320, 0, 7, "UHDTV 8K-50" },
> -
> -    { AV_PIX_FMT_YUV422P10, { 1001, 24000 }, 1920, 1080, 0, 3,  "HD1080P-24" },
> -    { AV_PIX_FMT_YUV422P10, { 1001, 30000 },  720,  486, 1, 2,  "SD Pro486"  },
> -};
> -static const int base_video_fmts_len = FF_ARRAY_ELEMS(base_video_fmts);
> -
> -enum VC2_QM {
> -    VC2_QM_DEF = 0,
> -    VC2_QM_COL,
> -    VC2_QM_FLAT,
> -
> -    VC2_QM_NB
> -};
> -
> -typedef struct SubBand {
> -    dwtcoef *buf;
> -    ptrdiff_t stride;
> -    int width;
> -    int height;
> -} SubBand;
> -
> -typedef struct Plane {
> -    SubBand band[MAX_DWT_LEVELS][4];
> -    dwtcoef *coef_buf;
> -    int width;
> -    int height;
> -    int dwt_width;
> -    int dwt_height;
> -    ptrdiff_t coef_stride;
> -} Plane;
> -
> -typedef struct SliceArgs {
> -    const struct VC2EncContext *ctx;
> -    union {
> -        int cache[DIRAC_MAX_QUANT_INDEX];
> -        uint8_t *buf;
> -    };
> -    int x;
> -    int y;
> -    int quant_idx;
> -    int bits_ceil;
> -    int bits_floor;
> -    int bytes;
> -} SliceArgs;
> -
> -typedef struct TransformArgs {
> -    const struct VC2EncContext *ctx;
> -    Plane *plane;
> -    const void *idata;
> -    ptrdiff_t istride;
> -    int field;
> -    VC2TransformContext t;
> -} TransformArgs;
> -
> -typedef struct VC2EncContext {
> -    AVClass *av_class;
> -    PutBitContext pb;
> -    Plane plane[3];
> -    AVCodecContext *avctx;
> -    DiracVersionInfo ver;
> -
> -    SliceArgs *slice_args;
> -    TransformArgs transform_args[3];
> -
> -    /* For conversion from unsigned pixel values to signed */
> -    int diff_offset;
> -    int bpp;
> -    int bpp_idx;
> -
> -    /* Picture number */
> -    uint32_t picture_number;
> -
> -    /* Base video format */
> -    int base_vf;
> -    int level;
> -    int profile;
> -
> -    /* Quantization matrix */
> -    uint8_t quant[MAX_DWT_LEVELS][4];
> -    int custom_quant_matrix;
> -
> -    /* Division LUT */
> -    uint32_t qmagic_lut[116][2];
> -
> -    int num_x; /* #slices horizontally */
> -    int num_y; /* #slices vertically */
> -    int prefix_bytes;
> -    int size_scaler;
> -    int chroma_x_shift;
> -    int chroma_y_shift;
> -
> -    /* Rate control stuff */
> -    int frame_max_bytes;
> -    int slice_max_bytes;
> -    int slice_min_bytes;
> -    int q_ceil;
> -    int q_avg;
> -
> -    /* Options */
> -    double tolerance;
> -    int wavelet_idx;
> -    int wavelet_depth;
> -    int strict_compliance;
> -    int slice_height;
> -    int slice_width;
> -    int interlaced;
> -    enum VC2_QM quant_matrix;
> -
> -    /* Parse code state */
> -    uint32_t next_parse_offset;
> -    enum DiracParseCodes last_parse_code;
> -} VC2EncContext;
> -
> -static av_always_inline void put_vc2_ue_uint(PutBitContext *pb, uint32_t val)
> -{
> -    int i;
> -    int bits = 0;
> -    unsigned topbit = 1, maxval = 1;
> -    uint64_t pbits = 0;
> -
> -    if (!val++) {
> -        put_bits(pb, 1, 1);
> -        return;
> -    }
> -
> -    while (val > maxval) {
> -        topbit <<= 1;
> -        maxval <<= 1;
> -        maxval |=  1;
> -    }
> -
> -    bits = ff_log2(topbit);
> -
> -    for (i = 0; i < bits; i++) {
> -        topbit >>= 1;
> -        av_assert2(pbits <= UINT64_MAX>>3);
> -        pbits <<= 2;
> -        if (val & topbit)
> -            pbits |= 0x1;
> -    }
> -
> -    put_bits64(pb, bits*2 + 1, (pbits << 1) | 1);
> -}
> -
> -static av_always_inline int count_vc2_ue_uint(uint32_t val)
> -{
> -    int topbit = 1, maxval = 1;
> -
> -    if (!val++)
> -        return 1;
> -
> -    while (val > maxval) {
> -        topbit <<= 1;
> -        maxval <<= 1;
> -        maxval |=  1;
> -    }
> -
> -    return ff_log2(topbit)*2 + 1;
> -}
> -
> -/* VC-2 10.4 - parse_info() */
> -static void encode_parse_info(VC2EncContext *s, enum DiracParseCodes pcode)
> -{
> -    uint32_t cur_pos, dist;
> -
> -    align_put_bits(&s->pb);
> -
> -    cur_pos = put_bytes_count(&s->pb, 0);
> -
> -    /* Magic string */
> -    ff_put_string(&s->pb, "BBCD", 0);
> -
> -    /* Parse code */
> -    put_bits(&s->pb, 8, pcode);
> -
> -    /* Next parse offset */
> -    dist = cur_pos - s->next_parse_offset;
> -    AV_WB32(s->pb.buf + s->next_parse_offset + 5, dist);
> -    s->next_parse_offset = cur_pos;
> -    put_bits32(&s->pb, pcode == DIRAC_PCODE_END_SEQ ? 13 : 0);
> -
> -    /* Last parse offset */
> -    put_bits32(&s->pb, s->last_parse_code == DIRAC_PCODE_END_SEQ ? 13 : dist);
> -
> -    s->last_parse_code = pcode;
> -}
> -
> -/* VC-2 11.1 - parse_parameters()
> - * The level dictates what the decoder should expect in terms of resolution
> - * and allows it to quickly reject whatever it can't support. Remember,
> - * this codec kinda targets cheapo FPGAs without much memory. Unfortunately
> - * it also limits us greatly in our choice of formats, hence the flag to disable
> - * strict_compliance */
> -static void encode_parse_params(VC2EncContext *s)
> -{
> -    put_vc2_ue_uint(&s->pb, s->ver.major); /* VC-2 demands this to be 2 */
> -    put_vc2_ue_uint(&s->pb, s->ver.minor); /* ^^ and this to be 0       */
> -    put_vc2_ue_uint(&s->pb, s->profile);   /* 3 to signal HQ profile    */
> -    put_vc2_ue_uint(&s->pb, s->level);     /* 3 - 1080/720, 6 - 4K      */
> -}
> -
> -/* VC-2 11.3 - frame_size() */
> -static void encode_frame_size(VC2EncContext *s)
> -{
> -    put_bits(&s->pb, 1, !s->strict_compliance);
> -    if (!s->strict_compliance) {
> -        AVCodecContext *avctx = s->avctx;
> -        put_vc2_ue_uint(&s->pb, avctx->width);
> -        put_vc2_ue_uint(&s->pb, avctx->height);
> -    }
> -}
> -
> -/* VC-2 11.3.3 - color_diff_sampling_format() */
> -static void encode_sample_fmt(VC2EncContext *s)
> -{
> -    put_bits(&s->pb, 1, !s->strict_compliance);
> -    if (!s->strict_compliance) {
> -        int idx;
> -        if (s->chroma_x_shift == 1 && s->chroma_y_shift == 0)
> -            idx = 1; /* 422 */
> -        else if (s->chroma_x_shift == 1 && s->chroma_y_shift == 1)
> -            idx = 2; /* 420 */
> -        else
> -            idx = 0; /* 444 */
> -        put_vc2_ue_uint(&s->pb, idx);
> -    }
> -}
> -
> -/* VC-2 11.3.4 - scan_format() */
> -static void encode_scan_format(VC2EncContext *s)
> -{
> -    put_bits(&s->pb, 1, !s->strict_compliance);
> -    if (!s->strict_compliance)
> -        put_vc2_ue_uint(&s->pb, s->interlaced);
> -}
> -
> -/* VC-2 11.3.5 - frame_rate() */
> -static void encode_frame_rate(VC2EncContext *s)
> -{
> -    put_bits(&s->pb, 1, !s->strict_compliance);
> -    if (!s->strict_compliance) {
> -        AVCodecContext *avctx = s->avctx;
> -        put_vc2_ue_uint(&s->pb, 0);
> -        put_vc2_ue_uint(&s->pb, avctx->time_base.den);
> -        put_vc2_ue_uint(&s->pb, avctx->time_base.num);
> -    }
> -}
> -
> -/* VC-2 11.3.6 - aspect_ratio() */
> -static void encode_aspect_ratio(VC2EncContext *s)
> -{
> -    put_bits(&s->pb, 1, !s->strict_compliance);
> -    if (!s->strict_compliance) {
> -        AVCodecContext *avctx = s->avctx;
> -        put_vc2_ue_uint(&s->pb, 0);
> -        put_vc2_ue_uint(&s->pb, avctx->sample_aspect_ratio.num);
> -        put_vc2_ue_uint(&s->pb, avctx->sample_aspect_ratio.den);
> -    }
> -}
> -
> -/* VC-2 11.3.7 - clean_area() */
> -static void encode_clean_area(VC2EncContext *s)
> -{
> -    put_bits(&s->pb, 1, 0);
> -}
> -
> -/* VC-2 11.3.8 - signal_range() */
> -static void encode_signal_range(VC2EncContext *s)
> -{
> -    put_bits(&s->pb, 1, !s->strict_compliance);
> -    if (!s->strict_compliance)
> -        put_vc2_ue_uint(&s->pb, s->bpp_idx);
> -}
> -
> -/* VC-2 11.3.9 - color_spec() */
> -static void encode_color_spec(VC2EncContext *s)
> -{
> -    AVCodecContext *avctx = s->avctx;
> -    put_bits(&s->pb, 1, !s->strict_compliance);
> -    if (!s->strict_compliance) {
> -        int val;
> -        put_vc2_ue_uint(&s->pb, 0);
> -
> -        /* primaries */
> -        put_bits(&s->pb, 1, 1);
> -        if (avctx->color_primaries == AVCOL_PRI_BT470BG)
> -            val = 2;
> -        else if (avctx->color_primaries == AVCOL_PRI_SMPTE170M)
> -            val = 1;
> -        else if (avctx->color_primaries == AVCOL_PRI_SMPTE240M)
> -            val = 1;
> -        else
> -            val = 0;
> -        put_vc2_ue_uint(&s->pb, val);
> -
> -        /* color matrix */
> -        put_bits(&s->pb, 1, 1);
> -        if (avctx->colorspace == AVCOL_SPC_RGB)
> -            val = 3;
> -        else if (avctx->colorspace == AVCOL_SPC_YCOCG)
> -            val = 2;
> -        else if (avctx->colorspace == AVCOL_SPC_BT470BG)
> -            val = 1;
> -        else
> -            val = 0;
> -        put_vc2_ue_uint(&s->pb, val);
> -
> -        /* transfer function */
> -        put_bits(&s->pb, 1, 1);
> -        if (avctx->color_trc == AVCOL_TRC_LINEAR)
> -            val = 2;
> -        else if (avctx->color_trc == AVCOL_TRC_BT1361_ECG)
> -            val = 1;
> -        else
> -            val = 0;
> -        put_vc2_ue_uint(&s->pb, val);
> -    }
> -}
> -
> -/* VC-2 11.3 - source_parameters() */
> -static void encode_source_params(VC2EncContext *s)
> -{
> -    encode_frame_size(s);
> -    encode_sample_fmt(s);
> -    encode_scan_format(s);
> -    encode_frame_rate(s);
> -    encode_aspect_ratio(s);
> -    encode_clean_area(s);
> -    encode_signal_range(s);
> -    encode_color_spec(s);
> -}
> -
> -/* VC-2 11 - sequence_header() */
> -static void encode_seq_header(VC2EncContext *s)
> -{
> -    align_put_bits(&s->pb);
> -    encode_parse_params(s);
> -    put_vc2_ue_uint(&s->pb, s->base_vf);
> -    encode_source_params(s);
> -    put_vc2_ue_uint(&s->pb, s->interlaced); /* Frames or fields coding */
> -}
> -
> -/* VC-2 12.1 - picture_header() */
> -static void encode_picture_header(VC2EncContext *s)
> -{
> -    align_put_bits(&s->pb);
> -    put_bits32(&s->pb, s->picture_number++);
> -}
> -
> -/* VC-2 12.3.4.1 - slice_parameters() */
> -static void encode_slice_params(VC2EncContext *s)
> -{
> -    put_vc2_ue_uint(&s->pb, s->num_x);
> -    put_vc2_ue_uint(&s->pb, s->num_y);
> -    put_vc2_ue_uint(&s->pb, s->prefix_bytes);
> -    put_vc2_ue_uint(&s->pb, s->size_scaler);
> -}
> -
> -/* 1st idx = LL, second - vertical, third - horizontal, fourth - total */
> -static const uint8_t vc2_qm_col_tab[][4] = {
> -    {20,  9, 15,  4},
> -    { 0,  6,  6,  4},
> -    { 0,  3,  3,  5},
> -    { 0,  3,  5,  1},
> -    { 0, 11, 10, 11}
> -};
> -
> -static const uint8_t vc2_qm_flat_tab[][4] = {
> -    { 0,  0,  0,  0},
> -    { 0,  0,  0,  0},
> -    { 0,  0,  0,  0},
> -    { 0,  0,  0,  0},
> -    { 0,  0,  0,  0}
> -};
> -
> -static void init_quant_matrix(VC2EncContext *s)
> -{
> -    int level, orientation;
> -
> -    if (s->wavelet_depth <= 4 && s->quant_matrix == VC2_QM_DEF) {
> -        s->custom_quant_matrix = 0;
> -        for (level = 0; level < s->wavelet_depth; level++) {
> -            s->quant[level][0] = ff_dirac_default_qmat[s->wavelet_idx][level][0];
> -            s->quant[level][1] = ff_dirac_default_qmat[s->wavelet_idx][level][1];
> -            s->quant[level][2] = ff_dirac_default_qmat[s->wavelet_idx][level][2];
> -            s->quant[level][3] = ff_dirac_default_qmat[s->wavelet_idx][level][3];
> -        }
> -        return;
> -    }
> -
> -    s->custom_quant_matrix = 1;
> -
> -    if (s->quant_matrix == VC2_QM_DEF) {
> -        for (level = 0; level < s->wavelet_depth; level++) {
> -            for (orientation = 0; orientation < 4; orientation++) {
> -                if (level <= 3)
> -                    s->quant[level][orientation] = ff_dirac_default_qmat[s->wavelet_idx][level][orientation];
> -                else
> -                    s->quant[level][orientation] = vc2_qm_col_tab[level][orientation];
> -            }
> -        }
> -    } else if (s->quant_matrix == VC2_QM_COL) {
> -        for (level = 0; level < s->wavelet_depth; level++) {
> -            for (orientation = 0; orientation < 4; orientation++) {
> -                s->quant[level][orientation] = vc2_qm_col_tab[level][orientation];
> -            }
> -        }
> -    } else {
> -        for (level = 0; level < s->wavelet_depth; level++) {
> -            for (orientation = 0; orientation < 4; orientation++) {
> -                s->quant[level][orientation] = vc2_qm_flat_tab[level][orientation];
> -            }
> -        }
> -    }
> -}
> -
> -/* VC-2 12.3.4.2 - quant_matrix() */
> -static void encode_quant_matrix(VC2EncContext *s)
> -{
> -    int level;
> -    put_bits(&s->pb, 1, s->custom_quant_matrix);
> -    if (s->custom_quant_matrix) {
> -        put_vc2_ue_uint(&s->pb, s->quant[0][0]);
> -        for (level = 0; level < s->wavelet_depth; level++) {
> -            put_vc2_ue_uint(&s->pb, s->quant[level][1]);
> -            put_vc2_ue_uint(&s->pb, s->quant[level][2]);
> -            put_vc2_ue_uint(&s->pb, s->quant[level][3]);
> -        }
> -    }
> -}
> -
> -/* VC-2 12.3 - transform_parameters() */
> -static void encode_transform_params(VC2EncContext *s)
> -{
> -    put_vc2_ue_uint(&s->pb, s->wavelet_idx);
> -    put_vc2_ue_uint(&s->pb, s->wavelet_depth);
> -
> -    encode_slice_params(s);
> -    encode_quant_matrix(s);
> -}
> -
> -/* VC-2 12.2 - wavelet_transform() */
> -static void encode_wavelet_transform(VC2EncContext *s)
> -{
> -    encode_transform_params(s);
> -    align_put_bits(&s->pb);
> -}
> -
> -/* VC-2 12 - picture_parse() */
> -static void encode_picture_start(VC2EncContext *s)
> -{
> -    align_put_bits(&s->pb);
> -    encode_picture_header(s);
> -    align_put_bits(&s->pb);
> -    encode_wavelet_transform(s);
> -}
> +#include "vc2enc_common.h"
>   
>   #define QUANT(c, mul, add, shift) (((mul) * (c) + (add)) >> (shift))
>   
> diff --git a/libavcodec/vc2enc_common.c b/libavcodec/vc2enc_common.c
> new file mode 100644
> index 0000000000..3cc59c4b62
> --- /dev/null
> +++ b/libavcodec/vc2enc_common.c
> @@ -0,0 +1,368 @@
> +/*
> + * Copyright (C) 2016 Open Broadcast Systems Ltd.
> + * Author        2016 Rostislav Pehlivanov <atomnuker at gmail.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "vc2enc_common.h"
> +
> +void put_vc2_ue_uint(PutBitContext *pb, uint32_t val)
> +{
> +    int i;
> +    int bits = 0;
> +    unsigned topbit = 1, maxval = 1;
> +    uint64_t pbits = 0;
> +
> +    if (!val++) {
> +        put_bits(pb, 1, 1);
> +        return;
> +    }
> +
> +    while (val > maxval) {
> +        topbit <<= 1;
> +        maxval <<= 1;
> +        maxval |=  1;
> +    }
> +
> +    bits = ff_log2(topbit);
> +
> +    for (i = 0; i < bits; i++) {
> +        topbit >>= 1;
> +        av_assert2(pbits <= UINT64_MAX>>3);
> +        pbits <<= 2;
> +        if (val & topbit)
> +            pbits |= 0x1;
> +    }
> +
> +    put_bits64(pb, bits*2 + 1, (pbits << 1) | 1);
> +}
> +
> +int count_vc2_ue_uint(uint32_t val)
> +{
> +    int topbit = 1, maxval = 1;
> +
> +    if (!val++)
> +        return 1;
> +
> +    while (val > maxval) {
> +        topbit <<= 1;
> +        maxval <<= 1;
> +        maxval |=  1;
> +    }
> +
> +    return ff_log2(topbit)*2 + 1;
> +}
> +
> +/* VC-2 10.4 - parse_info() */
> +void encode_parse_info(VC2EncContext *s, enum DiracParseCodes pcode)
> +{
> +    uint32_t cur_pos, dist;
> +
> +    align_put_bits(&s->pb);
> +
> +    cur_pos = put_bytes_count(&s->pb, 0);
> +
> +    /* Magic string */
> +    ff_put_string(&s->pb, "BBCD", 0);
> +
> +    /* Parse code */
> +    put_bits(&s->pb, 8, pcode);
> +
> +    /* Next parse offset */
> +    dist = cur_pos - s->next_parse_offset;
> +    AV_WB32(s->pb.buf + s->next_parse_offset + 5, dist);
> +    s->next_parse_offset = cur_pos;
> +    put_bits32(&s->pb, pcode == DIRAC_PCODE_END_SEQ ? 13 : 0);
> +
> +    cur_pos = put_bytes_count(&s->pb, 0);
> +
> +    /* Last parse offset */
> +    put_bits32(&s->pb, s->last_parse_code == DIRAC_PCODE_END_SEQ ? 13 : dist);
> +
> +    s->last_parse_code = pcode;
> +}
> +
> +/* VC-2 11.1 - parse_parameters()
> + * The level dictates what the decoder should expect in terms of resolution
> + * and allows it to quickly reject whatever it can't support. Remember,
> + * this codec kinda targets cheapo FPGAs without much memory. Unfortunately
> + * it also limits us greatly in our choice of formats, hence the flag to disable
> + * strict_compliance */
> +static void encode_parse_params(VC2EncContext *s)
> +{
> +    put_vc2_ue_uint(&s->pb, s->ver.major); /* VC-2 demands this to be 2 */
> +    put_vc2_ue_uint(&s->pb, s->ver.minor); /* ^^ and this to be 0       */
> +    put_vc2_ue_uint(&s->pb, s->profile);   /* 3 to signal HQ profile    */
> +    put_vc2_ue_uint(&s->pb, s->level);     /* 3 - 1080/720, 6 - 4K      */
> +}
> +
> +/* VC-2 11.3 - frame_size() */
> +static void encode_frame_size(VC2EncContext *s)
> +{
> +    put_bits(&s->pb, 1, !s->strict_compliance);
> +    if (!s->strict_compliance) {
> +        AVCodecContext *avctx = s->avctx;
> +        put_vc2_ue_uint(&s->pb, avctx->width);
> +        put_vc2_ue_uint(&s->pb, avctx->height);
> +    }
> +}
> +
> +/* VC-2 11.3.3 - color_diff_sampling_format() */
> +static void encode_sample_fmt(VC2EncContext *s)
> +{
> +    put_bits(&s->pb, 1, !s->strict_compliance);
> +    if (!s->strict_compliance) {
> +        int idx;
> +        if (s->chroma_x_shift == 1 && s->chroma_y_shift == 0)
> +            idx = 1; /* 422 */
> +        else if (s->chroma_x_shift == 1 && s->chroma_y_shift == 1)
> +            idx = 2; /* 420 */
> +        else
> +            idx = 0; /* 444 */
> +        put_vc2_ue_uint(&s->pb, idx);
> +    }
> +}
> +
> +/* VC-2 11.3.4 - scan_format() */
> +static void encode_scan_format(VC2EncContext *s)
> +{
> +    put_bits(&s->pb, 1, !s->strict_compliance);
> +    if (!s->strict_compliance)
> +        put_vc2_ue_uint(&s->pb, s->interlaced);
> +}
> +
> +/* VC-2 11.3.5 - frame_rate() */
> +static void encode_frame_rate(VC2EncContext *s)
> +{
> +    put_bits(&s->pb, 1, !s->strict_compliance);
> +    if (!s->strict_compliance) {
> +        AVCodecContext *avctx = s->avctx;
> +        put_vc2_ue_uint(&s->pb, 0);
> +        put_vc2_ue_uint(&s->pb, avctx->time_base.den);
> +        put_vc2_ue_uint(&s->pb, avctx->time_base.num);
> +    }
> +}
> +
> +/* VC-2 11.3.6 - aspect_ratio() */
> +static void encode_aspect_ratio(VC2EncContext *s)
> +{
> +    put_bits(&s->pb, 1, !s->strict_compliance);
> +    if (!s->strict_compliance) {
> +        AVCodecContext *avctx = s->avctx;
> +        put_vc2_ue_uint(&s->pb, 0);
> +        put_vc2_ue_uint(&s->pb, avctx->sample_aspect_ratio.num);
> +        put_vc2_ue_uint(&s->pb, avctx->sample_aspect_ratio.den);
> +    }
> +}
> +
> +/* VC-2 11.3.7 - clean_area() */
> +static void encode_clean_area(VC2EncContext *s)
> +{
> +    put_bits(&s->pb, 1, 0);
> +}
> +
> +/* VC-2 11.3.8 - signal_range() */
> +static void encode_signal_range(VC2EncContext *s)
> +{
> +    put_bits(&s->pb, 1, !s->strict_compliance);
> +    if (!s->strict_compliance)
> +        put_vc2_ue_uint(&s->pb, s->bpp_idx);
> +}
> +
> +/* VC-2 11.3.9 - color_spec() */
> +static void encode_color_spec(VC2EncContext *s)
> +{
> +    AVCodecContext *avctx = s->avctx;
> +    put_bits(&s->pb, 1, !s->strict_compliance);
> +    if (!s->strict_compliance) {
> +        int val;
> +        put_vc2_ue_uint(&s->pb, 0);
> +
> +        /* primaries */
> +        put_bits(&s->pb, 1, 1);
> +        if (avctx->color_primaries == AVCOL_PRI_BT470BG)
> +            val = 2;
> +        else if (avctx->color_primaries == AVCOL_PRI_SMPTE170M)
> +            val = 1;
> +        else if (avctx->color_primaries == AVCOL_PRI_SMPTE240M)
> +            val = 1;
> +        else
> +            val = 0;
> +        put_vc2_ue_uint(&s->pb, val);
> +
> +        /* color matrix */
> +        put_bits(&s->pb, 1, 1);
> +        if (avctx->colorspace == AVCOL_SPC_RGB)
> +            val = 3;
> +        else if (avctx->colorspace == AVCOL_SPC_YCOCG)
> +            val = 2;
> +        else if (avctx->colorspace == AVCOL_SPC_BT470BG)
> +            val = 1;
> +        else
> +            val = 0;
> +        put_vc2_ue_uint(&s->pb, val);
> +
> +        /* transfer function */
> +        put_bits(&s->pb, 1, 1);
> +        if (avctx->color_trc == AVCOL_TRC_LINEAR)
> +            val = 2;
> +        else if (avctx->color_trc == AVCOL_TRC_BT1361_ECG)
> +            val = 1;
> +        else
> +            val = 0;
> +        put_vc2_ue_uint(&s->pb, val);
> +    }
> +}
> +
> +/* VC-2 11.3 - source_parameters() */
> +static void encode_source_params(VC2EncContext *s)
> +{
> +    encode_frame_size(s);
> +    encode_sample_fmt(s);
> +    encode_scan_format(s);
> +    encode_frame_rate(s);
> +    encode_aspect_ratio(s);
> +    encode_clean_area(s);
> +    encode_signal_range(s);
> +    encode_color_spec(s);
> +}
> +
> +/* VC-2 11 - sequence_header() */
> +void encode_seq_header(VC2EncContext *s)
> +{
> +    align_put_bits(&s->pb);
> +    encode_parse_params(s);
> +    put_vc2_ue_uint(&s->pb, s->base_vf);
> +    encode_source_params(s);
> +    put_vc2_ue_uint(&s->pb, s->interlaced); /* Frames or fields coding */
> +}
> +
> +/* VC-2 12.1 - picture_header() */
> +static void encode_picture_header(VC2EncContext *s)
> +{
> +    align_put_bits(&s->pb);
> +    put_bits32(&s->pb, s->picture_number++);
> +}
> +
> +/* VC-2 12.3.4.1 - slice_parameters() */
> +static void encode_slice_params(VC2EncContext *s)
> +{
> +    put_vc2_ue_uint(&s->pb, s->num_x);
> +    put_vc2_ue_uint(&s->pb, s->num_y);
> +    put_vc2_ue_uint(&s->pb, s->prefix_bytes);
> +    put_vc2_ue_uint(&s->pb, s->size_scaler);
> +}
> +
> +/* 1st idx = LL, second - vertical, third - horizontal, fourth - total */
> +static const uint8_t vc2_qm_col_tab[][4] = {
> +    {20,  9, 15,  4},
> +    { 0,  6,  6,  4},
> +    { 0,  3,  3,  5},
> +    { 0,  3,  5,  1},
> +    { 0, 11, 10, 11}
> +};
> +
> +static const uint8_t vc2_qm_flat_tab[][4] = {
> +    { 0,  0,  0,  0},
> +    { 0,  0,  0,  0},
> +    { 0,  0,  0,  0},
> +    { 0,  0,  0,  0},
> +    { 0,  0,  0,  0}
> +};
> +
> +void init_quant_matrix(VC2EncContext *s)
> +{
> +    int level, orientation;
> +
> +    if (s->wavelet_depth <= 4 && s->quant_matrix == VC2_QM_DEF) {
> +        s->custom_quant_matrix = 0;
> +        for (level = 0; level < s->wavelet_depth; level++) {
> +            s->quant[level][0] = ff_dirac_default_qmat[s->wavelet_idx][level][0];
> +            s->quant[level][1] = ff_dirac_default_qmat[s->wavelet_idx][level][1];
> +            s->quant[level][2] = ff_dirac_default_qmat[s->wavelet_idx][level][2];
> +            s->quant[level][3] = ff_dirac_default_qmat[s->wavelet_idx][level][3];
> +        }
> +        return;
> +    }
> +
> +    s->custom_quant_matrix = 1;
> +
> +    if (s->quant_matrix == VC2_QM_DEF) {
> +        for (level = 0; level < s->wavelet_depth; level++) {
> +            for (orientation = 0; orientation < 4; orientation++) {
> +                if (level <= 3)
> +                    s->quant[level][orientation] = ff_dirac_default_qmat[s->wavelet_idx][level][orientation];
> +                else
> +                    s->quant[level][orientation] = vc2_qm_col_tab[level][orientation];
> +            }
> +        }
> +    } else if (s->quant_matrix == VC2_QM_COL) {
> +        for (level = 0; level < s->wavelet_depth; level++) {
> +            for (orientation = 0; orientation < 4; orientation++) {
> +                s->quant[level][orientation] = vc2_qm_col_tab[level][orientation];
> +            }
> +        }
> +    } else {
> +        for (level = 0; level < s->wavelet_depth; level++) {
> +            for (orientation = 0; orientation < 4; orientation++) {
> +                s->quant[level][orientation] = vc2_qm_flat_tab[level][orientation];
> +            }
> +        }
> +    }
> +}
> +
> +/* VC-2 12.3.4.2 - quant_matrix() */
> +static void encode_quant_matrix(VC2EncContext *s)
> +{
> +    PutBitContext *const pb = &s->pb;
> +
> +    /* A single flag bit says whether an explicit matrix follows. */
> +    put_bits(pb, 1, s->custom_quant_matrix);
> +    if (!s->custom_quant_matrix)
> +        return;
> +
> +    /* Orientation 0 is written once (level 0 only); orientations 1..3
> +     * are then written for every level, lowest level first. */
> +    put_vc2_ue_uint(pb, s->quant[0][0]);
> +    for (int lvl = 0; lvl < s->wavelet_depth; lvl++)
> +        for (int orient = 1; orient < 4; orient++)
> +            put_vc2_ue_uint(pb, s->quant[lvl][orient]);
> +}
> +
> +/* VC-2 12.3 - transform_parameters() */
> +static void encode_transform_params(VC2EncContext *s)
> +{
> +    PutBitContext *const pb = &s->pb;
> +
> +    /* Wavelet index first, then the decomposition depth. */
> +    put_vc2_ue_uint(pb, s->wavelet_idx);
> +    put_vc2_ue_uint(pb, s->wavelet_depth);
> +
> +    /* Slice parameters precede the quantisation matrix. */
> +    encode_slice_params(s);
> +    encode_quant_matrix(s);
> +}
> +
> +/* VC-2 12.2 - wavelet_transform() */
> +static void encode_wavelet_transform(VC2EncContext *s)
> +{
> +    PutBitContext *const pb = &s->pb;
> +
> +    encode_transform_params(s);
> +    /* Pad the bitstream to a byte boundary after the parameters. */
> +    align_put_bits(pb);
> +}
> +
> +/* VC-2 12 - picture_parse() */
> +void encode_picture_start(VC2EncContext *s)
> +{
> +    PutBitContext *const pb = &s->pb;
> +
> +    /* Byte-align before the picture header ... */
> +    align_put_bits(pb);
> +    encode_picture_header(s);
> +
> +    /* ... and again before the wavelet transform parameters. */
> +    align_put_bits(pb);
> +    encode_wavelet_transform(s);
> +}
> diff --git a/libavcodec/vc2enc_common.h b/libavcodec/vc2enc_common.h
> new file mode 100644
> index 0000000000..dfcd77752c
> --- /dev/null
> +++ b/libavcodec/vc2enc_common.h
> @@ -0,0 +1,279 @@
> +/*
> + * Copyright (C) 2016 Open Broadcast Systems Ltd.
> + * Author        2016 Rostislav Pehlivanov <atomnuker at gmail.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#ifndef AVCODEC_VC2ENC_COMMON_H
> +#define AVCODEC_VC2ENC_COMMON_H
> +
> +#include "avcodec.h"
> +#include "dirac.h"
> +#include "put_bits.h"
> +
> +#include "vc2enc_dwt.h"
> +#include "diractab.h"
> +#include "libavutil/vulkan.h"
> +
> +/* The limited size resolution of each slice forces us to do this */
> +#define SSIZE_ROUND(b) (FFALIGN((b), s->size_scaler) + 4 + s->prefix_bytes)
> +
> +/* Decides the cutoff point in # of slices to distribute the leftover bytes */
> +#define SLICE_REDIST_TOTAL 150
> +
> +typedef struct VC2BaseVideoFormat { /* One row of the base_video_fmts preset table */
> +    enum AVPixelFormat pix_fmt;
> +    AVRational time_base;
> +    int width, height;
> +    uint8_t interlaced, level; /* interlaced is 0 or 1 and level is 1..7 in every populated table row */
> +    char name[13]; /* longest name in the table is 11 characters plus the terminator */
> +} VC2BaseVideoFormat;
> +
> +static const VC2BaseVideoFormat base_video_fmts[] = { /* NOTE(review): static data defined in a header, so each including file gets its own copy */
> +    { 0 }, /* Custom format, here just to make indexing equal to base_vf */
> +    { AV_PIX_FMT_YUV420P,   { 1001, 15000 },  176,  120, 0, 1,     "QSIF525" },
> +    { AV_PIX_FMT_YUV420P,   {    2,    25 },  176,  144, 0, 1,     "QCIF"    },
> +    { AV_PIX_FMT_YUV420P,   { 1001, 15000 },  352,  240, 0, 1,     "SIF525"  },
> +    { AV_PIX_FMT_YUV420P,   {    2,    25 },  352,  288, 0, 1,     "CIF"     },
> +    { AV_PIX_FMT_YUV420P,   { 1001, 15000 },  704,  480, 0, 1,     "4SIF525" },
> +    { AV_PIX_FMT_YUV420P,   {    2,    25 },  704,  576, 0, 1,     "4CIF"    },
> +
> +    { AV_PIX_FMT_YUV422P10, { 1001, 30000 },  720,  480, 1, 2,   "SD480I-60" },
> +    { AV_PIX_FMT_YUV422P10, {    1,    25 },  720,  576, 1, 2,   "SD576I-50" },
> +
> +    { AV_PIX_FMT_YUV422P10, { 1001, 60000 }, 1280,  720, 0, 3,  "HD720P-60"  },
> +    { AV_PIX_FMT_YUV422P10, {    1,    50 }, 1280,  720, 0, 3,  "HD720P-50"  },
> +    { AV_PIX_FMT_YUV422P10, { 1001, 30000 }, 1920, 1080, 1, 3,  "HD1080I-60" },
> +    { AV_PIX_FMT_YUV422P10, {    1,    25 }, 1920, 1080, 1, 3,  "HD1080I-50" },
> +    { AV_PIX_FMT_YUV422P10, { 1001, 60000 }, 1920, 1080, 0, 3,  "HD1080P-60" },
> +    { AV_PIX_FMT_YUV422P10, {    1,    50 }, 1920, 1080, 0, 3,  "HD1080P-50" },
> +
> +    { AV_PIX_FMT_YUV444P12, {    1,    24 }, 2048, 1080, 0, 4,        "DC2K" },
> +    { AV_PIX_FMT_YUV444P12, {    1,    24 }, 4096, 2160, 0, 5,        "DC4K" },
> +
> +    { AV_PIX_FMT_YUV422P10, { 1001, 60000 }, 3840, 2160, 0, 6, "UHDTV 4K-60" },
> +    { AV_PIX_FMT_YUV422P10, {    1,    50 }, 3840, 2160, 0, 6, "UHDTV 4K-50" },
> +
> +    { AV_PIX_FMT_YUV422P10, { 1001, 60000 }, 7680, 4320, 0, 7, "UHDTV 8K-60" },
> +    { AV_PIX_FMT_YUV422P10, {    1,    50 }, 7680, 4320, 0, 7, "UHDTV 8K-50" },
> +
> +    { AV_PIX_FMT_YUV422P10, { 1001, 24000 }, 1920, 1080, 0, 3,  "HD1080P-24" },
> +    { AV_PIX_FMT_YUV422P10, { 1001, 30000 },  720,  486, 1, 2,  "SD Pro486"  },
> +};
> +static const int base_video_fmts_len = FF_ARRAY_ELEMS(base_video_fmts); /* entry count, including the placeholder at index 0 */
> +
> +enum VC2_QM { /* Quantisation matrix preset selected by the quant_matrix option */
> +    VC2_QM_DEF = 0, /* init_quant_matrix() takes levels 0..3 from ff_dirac_default_qmat and deeper levels from vc2_qm_col_tab */
> +    VC2_QM_COL, /* init_quant_matrix() fills every level from vc2_qm_col_tab */
> +    VC2_QM_FLAT, /* handled by the final else branch of init_quant_matrix(): vc2_qm_flat_tab */
> +
> +    VC2_QM_NB /* number of presets above */
> +};
> +
> +typedef struct SubBand { /* One subband of one decomposition level (Plane holds MAX_DWT_LEVELS x 4 of these) */
> +    dwtcoef *buf; /* presumably points into the owning Plane's coef_buf -- TODO confirm against the setup code */
> +    ptrdiff_t stride; /* NOTE(review): unit (coefficients vs. bytes) is not visible here -- confirm */
> +    int width;
> +    int height;
> +    int shift;
> +} SubBand;
> +
> +typedef struct Plane { /* Per-component transform state */
> +    SubBand band[MAX_DWT_LEVELS][4]; /* indexed [level][orientation] */
> +    dwtcoef *coef_buf;
> +    int width; /* width/height/dwt_width/dwt_height are copied into VC2DwtPlane by init_vulkan() */
> +    int height;
> +    int dwt_width;
> +    int dwt_height;
> +    ptrdiff_t coef_stride; /* init_vulkan() sizes a plane as coef_stride * dwt_height * sizeof(dwtcoef) */
> +} Plane;
> +
> +typedef struct SliceArgs { /* Per-slice working state; presumably used by the CPU encoder path -- TODO confirm */
> +    const struct VC2EncContext *ctx;
> +    union { /* the two members share storage, so only one is valid at a time */
> +        int cache[DIRAC_MAX_QUANT_INDEX];
> +        uint8_t *buf;
> +    };
> +    int x;
> +    int y;
> +    int quant_idx;
> +    int bits_ceil;
> +    int bits_floor;
> +    int bytes;
> +} SliceArgs;
> +
> +typedef struct TransformArgs { /* Per-plane DWT job description (VC2EncContext keeps one per plane) */
> +    struct VC2EncContext *ctx;
> +    Plane *plane;
> +    const void *idata; /* presumably the input samples of this plane -- TODO confirm */
> +    ptrdiff_t istride;
> +    int field;
> +    VC2TransformContext t;
> +} TransformArgs;
> +
> +typedef struct VC2DwtPlane { /* Plane dimensions as embedded in the shader push-constant structs */
> +    int width; /* all four fields are copied from the matching Plane fields in init_vulkan() */
> +    int height;
> +    int dwt_width;
> +    int dwt_height;
> +} VC2DwtPlane;
> +
> +typedef struct VC2DwtPushData { /* Push constants of the upload and DWT shaders; sizeof() is passed as their push-constant size */
> +    int s; /* 1 when wavelet_idx is VC2_TRANSFORM_HAAR_S, otherwise 0 */
> +    union {
> +        int diff_offset; /* written before the upload dispatch */
> +        int plane_idx; /* written per plane before each Haar dispatch; shares storage with diff_offset */
> +    };
> +    int level; /* wavelet_depth on the Haar path, current level index on the LeGall path */
> +    VC2DwtPlane planes[3];
> +    VkDeviceAddress pbuf[3]; /* device address of each plane inside the coefficient buffer */
> +} VC2DwtPushData;
> +
> +typedef struct VC2EncAuxData { /* Lookup data written through the mapped aux buffer in init_vulkan() */
> +    uint32_t quant[MAX_DWT_LEVELS][4]; /* filled from ff_dirac_default_qmat when the default matrix is in use */
> +    int ff_dirac_qscale_tab[116]; /* memcpy'd from the global table of the same name; NOTE(review): the sizes must match -- confirm */
> +} VC2EncAuxData;
> +
> +typedef struct VC2EncPushData { /* Push constants of the slice encoding shader */
> +    VkDeviceAddress p[3]; /* device address of each plane inside the coefficient buffer */
> +    VkDeviceAddress pb; /* output buffer address, advanced past the header bytes before dispatch */
> +    VkDeviceAddress luts; /* address of the VC2EncAuxData buffer */
> +    VkDeviceAddress slice; /* address of the VC2EncSliceArgs array */
> +    int num_x;
> +    int num_y;
> +    VC2DwtPlane planes[3];
> +    int wavelet_depth;
> +    int size_scaler;
> +    int prefix_bytes;
> +} VC2EncPushData;
> +
> +typedef struct VC2EncSliceArgs { /* One element per slice in the host-visible slice buffer */
> +    int quant_idx;
> +    int bytes; /* summed over all slices on the host after encoding to advance the bit writer */
> +    int pb_start; /* presumably the slice's offset in the output buffer -- TODO confirm against the shader */
> +    int pad; /* presumably padding to a 16-byte element -- TODO confirm */
> +} VC2EncSliceArgs;
> +
> +typedef struct VC2EncSliceCalcPushData { /* Push constants of the slice size calculation shader */
> +    VkDeviceAddress p[3]; /* device address of each plane inside the coefficient buffer */
> +    VkDeviceAddress luts; /* address of the VC2EncAuxData buffer */
> +    VkDeviceAddress slice; /* address of the VC2EncSliceArgs array */
> +    int num_x;
> +    int num_y;
> +    VC2DwtPlane planes[3];
> +    int wavelet_depth;
> +    int size_scaler;
> +    int prefix_bytes;
> +    int bits_ceil; /* set per frame to slice_max_bytes << 3 */
> +    int bits_floor; /* set per frame to slice_min_bytes << 3 */
> +} VC2EncSliceCalcPushData;
> +
> +typedef struct VC2EncContext { /* Encoder private context shared by the common code and the Vulkan encoder */
> +    AVClass *av_class;
> +    PutBitContext pb;
> +    Plane plane[3];
> +    AVCodecContext *avctx;
> +    DiracVersionInfo ver;
> +
> +    SliceArgs *slice_args;
> +    VC2EncSliceArgs* vk_slice_args; /* host mapping of the Vulkan slice argument buffer */
> +    TransformArgs transform_args[3];
> +
> +    /* For conversion from unsigned pixel values to signed */
> +    int diff_offset;
> +    int bpp;
> +    int bpp_idx;
> +
> +    /* Picture number */
> +    uint32_t picture_number;
> +
> +    /* Base video format */
> +    int base_vf;
> +    int level;
> +    int profile;
> +
> +    /* Quantization matrix */
> +    uint8_t quant[MAX_DWT_LEVELS][4];
> +    int custom_quant_matrix;
> +
> +    /* Division LUT */
> +    uint32_t qmagic_lut[116][2];
> +
> +    int num_x; /* #slices horizontally */
> +    int num_y; /* #slices vertically */
> +    int group_x; /* upload dispatch size: plane[0].dwt_width >> 3 */
> +    int group_y; /* upload dispatch size: plane[0].dwt_height >> 3 */
> +    int prefix_bytes;
> +    int size_scaler;
> +    int chroma_x_shift;
> +    int chroma_y_shift;
> +
> +    /* Rate control stuff */
> +    int frame_max_bytes;
> +    int slice_max_bytes;
> +    int slice_min_bytes;
> +    int q_ceil;
> +    int q_avg;
> +
> +    /* Options */
> +    double tolerance;
> +    int wavelet_idx;
> +    int wavelet_depth;
> +    int strict_compliance;
> +    int slice_height;
> +    int slice_width;
> +    int interlaced;
> +    enum VC2_QM quant_matrix;
> +
> +    /* Parse code state */
> +    uint32_t next_parse_offset;
> +    enum DiracParseCodes last_parse_code;
> +
> +    /* Vulkan state */
> +    FFVulkanContext vkctx;
> +    FFVkQueueFamilyCtx qf;
> +    FFVkExecPool e;
> +
> +    FFVulkanShader dwt_haar_shd;
> +    FFVulkanShader dwt_upload_shd;
> +    FFVulkanShader dwt_hor_shd, dwt_ver_shd;
> +    FFVulkanShader slice_shd;
> +    FFVulkanShader enc_shd;
> +    AVBufferPool* dwt_buf_pool; /* pool behind every ff_vk_get_pooled_buffer() call of the Vulkan encoder */
> +    int haar_subgroup; /* 1 when the subgroup variant of the Haar shader was selected */
> +
> +    VkBuffer plane_buf, slice_buf; /* handles used as targets of the buffer memory barriers */
> +    uint32_t buf_plane_size; /* bytes per plane in plane_buf; NOTE(review): 32-bit, confirm it cannot overflow for large frames */
> +    VC2EncPushData enc_consts;
> +    VC2DwtPushData dwt_consts;
> +    VC2EncSliceCalcPushData calc_consts;
> +} VC2EncContext;



> +
> +void put_vc2_ue_uint(PutBitContext *pb, uint32_t val);
> +
> +int count_vc2_ue_uint(uint32_t val);


These ones you should just expose as inline functions in the header.

> +
> +void init_quant_matrix(VC2EncContext *s);
> +
> +void encode_parse_info(VC2EncContext *s, enum DiracParseCodes pcode);
> +
> +void encode_seq_header(VC2EncContext *s);
> +
> +void encode_picture_start(VC2EncContext *s);


Could you prefix the functions properly?

ff_vc2_...


Also, could you split this off into a separate commit so it can be 
merged first?


> +
> +#endif
> diff --git a/libavcodec/vc2enc_vulkan.c b/libavcodec/vc2enc_vulkan.c
> new file mode 100644
> index 0000000000..195f0273a3
> --- /dev/null
> +++ b/libavcodec/vc2enc_vulkan.c
> @@ -0,0 +1,786 @@
> +/*
> + * Copyright (C) 2016 Open Broadcast Systems Ltd.
> + * Author        2016 Rostislav Pehlivanov <atomnuker at gmail.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/avassert.h"
> +#include "libavutil/mem.h"
> +#include "libavutil/pixdesc.h"
> +#include "libavutil/opt.h"
> +#include "libavutil/version.h"
> +#include "libavutil/vulkan_spirv.h"
> +#include "libavutil/hwcontext_vulkan.h"
> +#include "libavutil/vulkan_loader.h"
> +#include "libavutil/vulkan.h"
> +#include "codec_internal.h"
> +#include "internal.h"
> +#include "encode.h"
> +#include "version.h"
> +#include "vc2enc_common.h"
> +#include "hwconfig.h"
> +
> +#define LEGALL_WORKGROUP_X 64
> +#define SLICE_WORKGROUP_X 128
> +
> +extern const char *ff_source_common_comp;
> +extern const char *ff_source_vc2_encode_comp;
> +extern const char *ff_source_vc2_dwt_hor_legall_comp;
> +extern const char *ff_source_vc2_dwt_ver_legall_comp;
> +extern const char *ff_source_vc2_slice_sizes_comp;
> +extern const char *ff_source_vc2_dwt_upload_comp;
> +extern const char *ff_source_vc2_dwt_haar_comp;
> +extern const char *ff_source_vc2_dwt_haar_subgroup_comp;
> +
> +/**
> + * Build one compute shader and register it with the encoder's exec pool.
> + *
> + * @param s         encoder context (supplies the Vulkan context and exec pool)
> + * @param spv       SPIR-V compiler used to compile the GLSL source
> + * @param shd       shader object to initialise
> + * @param push_size size in bytes of the push-constant range starting at offset 0
> + * @param lg_x      local workgroup size, X
> + * @param lg_y      local workgroup size, Y
> + * @param lg_z      local workgroup size, Z
> + * @param pl_name   shader name; "enc_pl" additionally pulls in the common
> + *                  source with PB_UNALIGNED defined
> + * @param pl_source GLSL source of the shader body
> + * @param plane_img non-zero to declare the 3-element "plane_imgs" storage
> + *                  image descriptor
> + * @return 0 on success, a negative error code otherwise
> + */
> +static int init_vulkan_pipeline(VC2EncContext* s, FFVkSPIRVCompiler *spv,
> +                                FFVulkanShader* shd, int push_size,
> +                                int lg_x, int lg_y, int lg_z,
> +                                const char* pl_name, const char* pl_source,
> +                                int plane_img)
> +{
> +    uint8_t *spv_data;
> +    size_t spv_len;
> +    void *spv_opaque = NULL;
> +    FFVulkanContext *vkctx = &s->vkctx;
> +    FFVulkanDescriptorSetBinding *desc;
> +    int err = 0;
> +
> +    RET(ff_vk_shader_init(vkctx, shd, pl_name, VK_SHADER_STAGE_COMPUTE_BIT,
> +                          NULL, 0, lg_x, lg_y, lg_z, 0));
> +
> +    if (plane_img) {
> +        desc = (FFVulkanDescriptorSetBinding []) {
> +            {
> +                .name       = "plane_imgs",
> +                .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
> +                .mem_layout = ff_vk_shader_rep_fmt(vkctx->frames->sw_format, FF_VK_REP_NATIVE),
> +                .dimensions = 2,
> +                .elems      = 3,
> +                .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
> +            },
> +        };
> +        RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc, 1, 0, 0));
> +    }
> +
> +    RET(ff_vk_shader_add_push_const(shd, 0, push_size, VK_SHADER_STAGE_COMPUTE_BIT));
> +    if (strcmp(pl_name, "enc_pl") == 0) {
> +        av_bprintf(&shd->src, "#define PB_UNALIGNED\n"                   );
> +        GLSLD(ff_source_common_comp);
> +    }
> +    GLSLD(pl_source);
> +
> +    /* Compile, link and register the shader */
> +    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
> +    RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
> +    RET(ff_vk_shader_register_exec(vkctx, &s->e, shd));
> +
> +fail:
> +    /* The compiler output is only needed until the shader has been linked */
> +    if (spv_opaque)
> +        spv->free_shader(spv, &spv_opaque);
> +    return err;
> +}
> +
> +/**
> + * Set up all Vulkan state of the encoder: exec pool, coefficient buffer,
> + * auxiliary lookup buffer, slice argument buffer and the compute shaders
> + * for the configured wavelet.
> + *
> + * NOTE(review): coef_buf and dwt_buf are pooled references that are only
> + * held in locals here (dwt_buf is even reused for two allocations), so
> + * nothing in this function can release them later -- confirm ownership.
> + *
> + * @param avctx codec context whose priv_data is a VC2EncContext
> + * @return 0 on success, a negative error code otherwise
> + */
> +static int init_vulkan(AVCodecContext *avctx)
> +{
> +    VC2EncContext *s = avctx->priv_data;
> +    FFVulkanContext *vkctx = &s->vkctx;
> +    FFVkSPIRVCompiler *spv;
> +    AVBufferRef* dwt_buf = NULL;
> +    AVBufferRef* coef_buf = NULL;
> +    FFVkBuffer* vk_buf = NULL;
> +    VC2EncAuxData* ad = NULL;
> +    Plane *p;
> +    VC2DwtPlane vk_plane;
> +    int i, level, ret;
> +    uint32_t subgroup_size = vkctx->subgroup_props.maxSubgroupSize;
> +
> +    /* Initialize spirv compiler */
> +    spv = ff_vk_spirv_init();
> +    if (!spv) {
> +        av_log(avctx, AV_LOG_ERROR, "Unable to initialize SPIR-V compiler!\n");
> +        return AVERROR_EXTERNAL;
> +    }
> +
> +    ff_vk_qf_init(vkctx, &s->qf, VK_QUEUE_COMPUTE_BIT);
> +    ret = ff_vk_exec_pool_init(vkctx, &s->qf, &s->e, 1, 0, 0, 0, NULL);
> +    if (ret < 0)
> +        goto fail;
> +
> +    /* Allocate coefficient buffer for each plane */
> +    p = &s->plane[0];
> +    s->buf_plane_size = p->coef_stride*p->dwt_height*sizeof(dwtcoef);
> +    ret = ff_vk_get_pooled_buffer(vkctx, &s->dwt_buf_pool, &coef_buf,
> +                                  VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT |
> +                                  VK_BUFFER_USAGE_TRANSFER_DST_BIT, NULL,
> +                                  s->buf_plane_size * 3,
> +                                  VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
> +    if (ret < 0)
> +        goto fail;
> +    vk_buf = (FFVkBuffer*)coef_buf->data;
> +    s->plane_buf = vk_buf->buf;
> +
> +    /* All planes live back to back in one buffer, buf_plane_size bytes apart */
> +    for (i = 0; i < 3; i++) {
> +        p = &s->plane[i];
> +        vk_plane.dwt_width = p->dwt_width;
> +        vk_plane.dwt_height = p->dwt_height;
> +        vk_plane.width = p->width;
> +        vk_plane.height = p->height;
> +        memcpy(&s->calc_consts.planes[i], &vk_plane, sizeof(vk_plane));
> +        memcpy(&s->dwt_consts.planes[i], &vk_plane, sizeof(vk_plane));
> +        memcpy(&s->enc_consts.planes[i], &vk_plane, sizeof(vk_plane));
> +        s->enc_consts.p[i] = vk_buf->address + s->buf_plane_size * i;
> +        s->calc_consts.p[i] = vk_buf->address + s->buf_plane_size * i;
> +        s->dwt_consts.pbuf[i] = vk_buf->address + s->buf_plane_size * i;
> +    }
> +
> +    /* Initialize DWT push data */
> +    s->dwt_consts.diff_offset = s->diff_offset;
> +    s->dwt_consts.s = s->wavelet_idx == VC2_TRANSFORM_HAAR_S ? 1 : 0;
> +    s->dwt_consts.level = 0;
> +
> +    /* Initialize slice calc push data */
> +    s->calc_consts.num_x = s->num_x;
> +    s->calc_consts.num_y = s->num_y;
> +    s->calc_consts.wavelet_depth = s->wavelet_depth;
> +    s->calc_consts.prefix_bytes = s->prefix_bytes;
> +
> +    /* Initialize encoder push data */
> +    s->enc_consts.wavelet_depth = s->wavelet_depth;
> +    s->enc_consts.num_x = s->num_x;
> +    s->enc_consts.num_y = s->num_y;
> +
> +    /* Create buffer for encoder auxiliary data. */
> +    ret = ff_vk_get_pooled_buffer(vkctx, &s->dwt_buf_pool, &dwt_buf,
> +                                  VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL,
> +                                  sizeof(VC2EncAuxData),
> +                                  VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
> +                                      VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);


Indentation?


> +    if (ret < 0)
> +        goto fail;
> +    vk_buf = (FFVkBuffer*)dwt_buf->data;
> +    s->calc_consts.luts = vk_buf->address;
> +    s->enc_consts.luts = vk_buf->address;
> +    ad = (VC2EncAuxData*)vk_buf->mapped_mem;
> +    /* NOTE(review): ad->quant is only written for the default matrix with
> +     * depth <= 4; for every other configuration the shaders would read
> +     * whatever the buffer contained -- confirm this is intended. */
> +    if (s->wavelet_depth <= 4 && s->quant_matrix == VC2_QM_DEF) {
> +        s->custom_quant_matrix = 0;
> +        for (level = 0; level < s->wavelet_depth; level++) {
> +            ad->quant[level][0] = ff_dirac_default_qmat[s->wavelet_idx][level][0];
> +            ad->quant[level][1] = ff_dirac_default_qmat[s->wavelet_idx][level][1];
> +            ad->quant[level][2] = ff_dirac_default_qmat[s->wavelet_idx][level][2];
> +            ad->quant[level][3] = ff_dirac_default_qmat[s->wavelet_idx][level][3];
> +        }
> +    }
> +    memcpy(ad->ff_dirac_qscale_tab, ff_dirac_qscale_tab, sizeof(ff_dirac_qscale_tab));
> +
> +    /* Create buffer for slice arguments */
> +    ret = ff_vk_get_pooled_buffer(vkctx, &s->dwt_buf_pool, &dwt_buf,
> +                                  VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL,
> +                                  sizeof(VC2EncSliceArgs) * s->num_x * s->num_y,
> +                                  VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
> +                                      VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
> +    if (ret < 0)
> +        goto fail;
> +    vk_buf = (FFVkBuffer*)dwt_buf->data;
> +    s->slice_buf = vk_buf->buf;
> +    s->vk_slice_args = (VC2EncSliceArgs*)vk_buf->mapped_mem;
> +    s->calc_consts.slice = vk_buf->address;
> +    s->enc_consts.slice = vk_buf->address;
> +
> +    s->haar_subgroup = 0;
> +
> +    /* Initialize encoding pipelines. The slice size shader is later fed a
> +     * VC2EncSliceCalcPushData, so its push range is declared with that type. */
> +    ret = init_vulkan_pipeline(s, spv, &s->dwt_upload_shd, sizeof(VC2DwtPushData),
> +                               8, 8, 1, "dwt_upload_pl", ff_source_vc2_dwt_upload_comp, 1);
> +    if (ret < 0)
> +        goto fail;
> +    ret = init_vulkan_pipeline(s, spv, &s->slice_shd, sizeof(VC2EncSliceCalcPushData),
> +                               SLICE_WORKGROUP_X, 1, 1, "slice_pl", ff_source_vc2_slice_sizes_comp, 0);
> +    if (ret < 0)
> +        goto fail;
> +    ret = init_vulkan_pipeline(s, spv, &s->enc_shd, sizeof(VC2EncPushData),
> +                               SLICE_WORKGROUP_X, 1, 1, "enc_pl", ff_source_vc2_encode_comp, 0);
> +    if (ret < 0)
> +        goto fail;
> +
> +    if (s->wavelet_idx == VC2_TRANSFORM_HAAR || s->wavelet_idx == VC2_TRANSFORM_HAAR_S) {
> +        /* The subgroup variant is only used for shallow transforms:
> +         * depth < 3 with 32-wide subgroups, depth < 4 with 64-wide ones. */
> +        const int use_subgroup = (subgroup_size == 32 && s->wavelet_depth < 3) ||
> +                                 (subgroup_size == 64 && s->wavelet_depth < 4);
> +        if (use_subgroup) {
> +            ret = init_vulkan_pipeline(s, spv, &s->dwt_haar_shd, sizeof(VC2DwtPushData),
> +                                       64, 1, 1, "dwt_haar_pl", ff_source_vc2_dwt_haar_subgroup_comp, 0);
> +            s->haar_subgroup = 1;
> +        } else {
> +            ret = init_vulkan_pipeline(s, spv, &s->dwt_haar_shd, sizeof(VC2DwtPushData),
> +                                       16, 16, 1, "dwt_haar_pl", ff_source_vc2_dwt_haar_comp, 0);
> +        }
> +        if (ret < 0)
> +            goto fail;
> +    } else if (s->wavelet_idx == VC2_TRANSFORM_5_3) {
> +        ret = init_vulkan_pipeline(s, spv, &s->dwt_hor_shd, sizeof(VC2DwtPushData),
> +                                   LEGALL_WORKGROUP_X, 1, 1, "dwt_hor_pl", ff_source_vc2_dwt_hor_legall_comp, 0);
> +        if (ret < 0)
> +            goto fail;
> +        ret = init_vulkan_pipeline(s, spv, &s->dwt_ver_shd, sizeof(VC2DwtPushData),
> +                                   LEGALL_WORKGROUP_X, 1, 1, "dwt_ver_pl", ff_source_vc2_dwt_ver_legall_comp, 0);
> +        if (ret < 0)
> +            goto fail;
> +    }
> +
> +    /* The upload shader runs 8x8 workgroups over the luma DWT area */
> +    s->group_x = s->plane[0].dwt_width >> 3;
> +    s->group_y = s->plane[0].dwt_height >> 3;
> +    ret = 0;
> +
> +fail:
> +    /* The compiler is only needed while the shaders are being built */
> +    spv->uninit(&spv);
> +    return ret;
> +}
> +
> +/* Record the Haar DWT dispatches for all three planes, followed by a
> + * buffer barrier so later passes see the transformed coefficients. */
> +static void dwt_plane_haar(VC2EncContext *s, FFVkExecContext *exec, VkBufferMemoryBarrier2* buf_bar)
> +{
> +    FFVulkanContext *vkctx = &s->vkctx;
> +    FFVulkanFunctions *vk = &vkctx->vkfn;
> +    /* One workgroup per 8x8 tile for the subgroup shader, per 16x16 tile otherwise */
> +    const int tile_shift = s->haar_subgroup ? 3 : 4;
> +    const int tile_size = 1 << tile_shift;
> +
> +    /* A single dispatch covers every level, so the shader gets the full depth */
> +    s->dwt_consts.level = s->wavelet_depth;
> +    ff_vk_exec_bind_shader(vkctx, exec, &s->dwt_haar_shd);
> +
> +    /* Haar pass */
> +    for (int plane = 0; plane < 3; plane++) {
> +        const int groups_x = FFALIGN(s->plane[plane].dwt_width, tile_size) >> tile_shift;
> +        const int groups_y = FFALIGN(s->plane[plane].dwt_height, tile_size) >> tile_shift;
> +
> +        s->dwt_consts.plane_idx = plane;
> +        ff_vk_shader_update_push_const(vkctx, exec, &s->dwt_haar_shd, VK_SHADER_STAGE_COMPUTE_BIT,
> +                                       0, sizeof(VC2DwtPushData), &s->dwt_consts);
> +        vk->CmdDispatch(exec->buf, groups_x, groups_y, 1);
> +    }
> +
> +    /* Wait for Haar dispatches to complete */
> +    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
> +                                           .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
> +                                           .pBufferMemoryBarriers = buf_bar,
> +                                           .bufferMemoryBarrierCount = 1U,
> +                                       });
> +}
> +
> +/* Record the LeGall (5,3) DWT: for every level a horizontal pass and a
> + * vertical pass, each followed by a buffer barrier on the coefficients.
> + * Dispatches use Z = 3, i.e. one layer per plane, sized from the luma plane. */
> +static void dwt_plane_legall(VC2EncContext *s, FFVkExecContext *exec, VkBufferMemoryBarrier2* buf_bar)
> +{
> +    int i;
> +    /* Round up to whole workgroups; derived from the macro so that the
> +     * dispatch size follows the shader's local size if it ever changes. */
> +    int legall_group_x = (s->plane[0].dwt_height + LEGALL_WORKGROUP_X - 1) / LEGALL_WORKGROUP_X;
> +    int legall_group_y = (s->plane[0].dwt_width + LEGALL_WORKGROUP_X - 1) / LEGALL_WORKGROUP_X;
> +    FFVulkanContext *vkctx = &s->vkctx;
> +    FFVulkanFunctions *vk = &vkctx->vkfn;
> +
> +    /* Perform LeGall wavelet transform, one level at a time */
> +    for (i = 0; i < s->wavelet_depth; i++) {
> +        s->dwt_consts.level = i;
> +
> +        /* Horizontal LeGall pass (group count derived from the plane height) */
> +        ff_vk_exec_bind_shader(vkctx, exec, &s->dwt_hor_shd);
> +        ff_vk_shader_update_push_const(vkctx, exec, &s->dwt_hor_shd, VK_SHADER_STAGE_COMPUTE_BIT,
> +                                       0, sizeof(VC2DwtPushData), &s->dwt_consts);
> +        vk->CmdDispatch(exec->buf, legall_group_x, 1, 3);
> +        vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
> +                                               .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
> +                                               .pBufferMemoryBarriers = buf_bar,
> +                                               .bufferMemoryBarrierCount = 1U,
> +                                           });
> +
> +        /* Vertical LeGall pass (group count derived from the plane width) */
> +        ff_vk_exec_bind_shader(vkctx, exec, &s->dwt_ver_shd);
> +        ff_vk_shader_update_push_const(vkctx, exec, &s->dwt_ver_shd, VK_SHADER_STAGE_COMPUTE_BIT,
> +                                       0, sizeof(VC2DwtPushData), &s->dwt_consts);
> +        vk->CmdDispatch(exec->buf, legall_group_y, 1, 3);
> +        vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
> +                                               .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
> +                                               .pBufferMemoryBarriers = buf_bar,
> +                                               .bufferMemoryBarrierCount = 1U,
> +                                           });
> +    }
> +}
> +
> +/* Record the GPU front end of a frame: transition the input images, copy
> + * the samples into the coefficient buffer, run the configured wavelet
> + * transform and finally the slice size calculation.
> + *
> + * NOTE(review): the return values of ff_vk_exec_add_dep_frame() and
> + * ff_vk_create_imageviews() are dropped because this function returns void. */
> +static void dwt_plane(VC2EncContext *s, FFVkExecContext *exec, AVFrame *frame)
> +{
> +    int group_x = s->group_x, group_y = s->group_y;
> +    FFVulkanContext *vkctx = &s->vkctx;
> +    FFVulkanFunctions *vk = &vkctx->vkfn;
> +    /* One invocation per slice, rounded up to whole workgroups */
> +    uint32_t num_slice_groups = (s->num_x*s->num_y + SLICE_WORKGROUP_X - 1) / SLICE_WORKGROUP_X;
> +    VkBufferMemoryBarrier2 buf_bar;
> +    VkBufferMemoryBarrier2 slice_buf_bar;
> +    VkImageView views[AV_NUM_DATA_POINTERS];
> +    VkImageMemoryBarrier2 img_bar[AV_NUM_DATA_POINTERS];
> +    int nb_img_bar = 0;
> +
> +    /* Compute-to-compute barrier over the whole coefficient buffer, reused
> +     * between every pass that reads or writes coefficients. */
> +    buf_bar = (VkBufferMemoryBarrier2) {
> +        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
> +        .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
> +        .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
> +        .srcAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT | VK_ACCESS_2_SHADER_READ_BIT,
> +        .dstAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT | VK_ACCESS_2_SHADER_READ_BIT,
> +        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
> +        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
> +        .buffer = s->plane_buf,
> +        .size = s->buf_plane_size * 3,
> +        .offset = 0,
> +    };
> +
> +    ff_vk_exec_add_dep_frame(vkctx, exec, frame,
> +                             VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
> +                             VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT);
> +    ff_vk_create_imageviews(vkctx, exec, views, frame, FF_VK_REP_UINT);
> +    ff_vk_frame_barrier(vkctx, exec, frame, img_bar, &nb_img_bar,
> +                        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
> +                        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
> +                        VK_ACCESS_SHADER_READ_BIT,
> +                        VK_IMAGE_LAYOUT_GENERAL,
> +                        VK_QUEUE_FAMILY_IGNORED);
> +    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
> +                                           .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
> +                                           .pImageMemoryBarriers = img_bar,
> +                                           .imageMemoryBarrierCount = nb_img_bar,
> +                                       });
> +
> +    ff_vk_shader_update_img_array(vkctx, exec, &s->dwt_upload_shd, frame, views, 0, 0,
> +                                      VK_IMAGE_LAYOUT_GENERAL,
> +                                      VK_NULL_HANDLE);
> +
> +    /* Upload coefficients from planes to the buffer. diff_offset shares
> +     * storage with plane_idx, so it is rewritten for every frame. */
> +    s->dwt_consts.diff_offset = s->diff_offset;
> +    ff_vk_exec_bind_shader(vkctx, exec, &s->dwt_upload_shd);
> +    ff_vk_shader_update_push_const(vkctx, exec, &s->dwt_upload_shd, VK_SHADER_STAGE_COMPUTE_BIT,
> +                                   0, sizeof(VC2DwtPushData), &s->dwt_consts);
> +    vk->CmdDispatch(exec->buf, group_x, group_y, 3);
> +    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
> +                                           .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
> +                                           .pBufferMemoryBarriers = &buf_bar,
> +                                           .bufferMemoryBarrierCount = 1U,
> +                                       });
> +
> +    /* Perform the wavelet transform selected by wavelet_idx; any other
> +     * index records no transform at all. */
> +    if (s->wavelet_idx == VC2_TRANSFORM_HAAR || s->wavelet_idx == VC2_TRANSFORM_HAAR_S) {
> +        dwt_plane_haar(s, exec, &buf_bar);
> +    } else if (s->wavelet_idx == VC2_TRANSFORM_5_3) {
> +        dwt_plane_legall(s, exec, &buf_bar);
> +    }
> +
> +    /* Calculate slice sizes. */
> +    ff_vk_exec_bind_shader(vkctx, exec, &s->slice_shd);
> +    ff_vk_shader_update_push_const(vkctx, exec, &s->slice_shd, VK_SHADER_STAGE_COMPUTE_BIT,
> +                                   0, sizeof(VC2EncSliceCalcPushData), &s->calc_consts);
> +    vk->CmdDispatch(exec->buf, num_slice_groups, 1, 1);
> +
> +    /* Make the computed slice arguments visible to the encode shader */
> +    slice_buf_bar = (VkBufferMemoryBarrier2) {
> +        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
> +        .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
> +        .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
> +        .srcAccessMask = VK_ACCESS_2_SHADER_READ_BIT | VK_ACCESS_2_SHADER_WRITE_BIT,
> +        .dstAccessMask = VK_ACCESS_2_SHADER_READ_BIT,
> +        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
> +        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
> +        .buffer = s->slice_buf,
> +        .size = sizeof(VC2EncSliceArgs) * s->num_x * s->num_y,
> +        .offset = 0,
> +    };
> +    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
> +                                           .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
> +                                           .pBufferMemoryBarriers = &slice_buf_bar,
> +                                           .bufferMemoryBarrierCount = 1U,
> +                                       });
> +}
> +
> +/* Dispatch the slice encoding shader, wait for it, then advance the host
> + * bit writer past the bytes the GPU wrote so the end header lands after
> + * the slice data.
> + *
> + * NOTE(review): enc_consts.pb is advanced with "+=" by the total number of
> + * bytes output so far. encode_frame() only resets it when field < 2, so a
> + * second call without that reset would add the first offset twice --
> + * confirm against the interlaced call path. */
> +static void vulkan_encode_slices(VC2EncContext *s, FFVkExecContext *exec)
> +{
> +    /* One invocation per slice, rounded up to whole workgroups */
> +    uint32_t num_slice_groups = (s->num_x*s->num_y + SLICE_WORKGROUP_X - 1) / SLICE_WORKGROUP_X;
> +    FFVulkanContext *vkctx = &s->vkctx;
> +    FFVulkanFunctions *vk = &vkctx->vkfn;
> +    int skip = 0;
> +
> +    /* The shader writes right after the headers already emitted on the host */
> +    flush_put_bits(&s->pb);
> +    s->enc_consts.pb += put_bytes_output(&s->pb);
> +
> +    ff_vk_exec_bind_shader(vkctx, exec, &s->enc_shd);
> +    ff_vk_shader_update_push_const(vkctx, exec, &s->enc_shd, VK_SHADER_STAGE_COMPUTE_BIT,
> +                                   0, sizeof(VC2EncPushData), &s->enc_consts);
> +
> +    vk->CmdDispatch(exec->buf, num_slice_groups, 1, 1);
> +
> +    ff_vk_exec_submit(vkctx, exec);
> +    ff_vk_exec_wait(vkctx, exec);
> +
> +    /* Sum the per-slice byte counts reported through the mapped slice buffer */
> +    for (int slice_y = 0; slice_y < s->num_y; slice_y++) {
> +        for (int slice_x = 0; slice_x < s->num_x; slice_x++) {
> +            VC2EncSliceArgs *args = &s->vk_slice_args[s->num_x*slice_y + slice_x];
> +            skip += args->bytes;
> +        }
> +    }
> +
> +    /* Skip forward to write end header */
> +    skip_put_bytes(&s->pb, skip);
> +}
> +
> +/**
> + * Encode one picture: record the DWT and slice sizing on the GPU, allocate
> + * the output buffer, write the headers on the host and let the GPU write
> + * the slice data.
> + *
> + * @param s           encoder context
> + * @param avpkt       output packet; backed by a pooled Vulkan buffer when
> + *                    field < 2
> + * @param frame       input frame
> + * @param aux_data    optional string written as an auxiliary data unit
> + * @param header_size number of bytes reserved for headers
> + * @param field       a new output buffer is only allocated when this is < 2
> + * @return 0 on success, a negative error code if the buffer allocation fails
> + */
> +static int encode_frame(VC2EncContext *s, AVPacket *avpkt, const AVFrame *frame,
> +                        const char *aux_data, const int header_size, int field)
> +{
> +    int ret;
> +    int64_t max_frame_bytes;
> +    AVBufferRef *avpkt_buf = NULL;
> +    FFVkBuffer* buf_vk = NULL;
> +    FFVulkanContext *vkctx = &s->vkctx;
> +    FFVkExecContext *exec = ff_vk_exec_get(vkctx, &s->e);
> +
> +    ff_vk_exec_start(vkctx, exec);
> +
> +    /* Upload the input frame and record the DWT and slice size passes. */
> +    dwt_plane(s, exec, (AVFrame*)frame);
> +
> +    /* Size the output for the headers plus width * height coefficients */
> +    max_frame_bytes = header_size + s->avctx->width * s->avctx->height * sizeof(dwtcoef);
> +    s->custom_quant_matrix = 0;
> +
> +    /* Get a pooled device local host visible buffer for writing output data */
> +    if (field < 2) {
> +        ret = ff_vk_get_pooled_buffer(vkctx, &s->dwt_buf_pool, &avpkt_buf,
> +                                      VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
> +                                          VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL,
> +                                      max_frame_bytes << s->interlaced,
> +                                      VK_MEMORY_PROPERTY_HOST_CACHED_BIT |
> +                                          VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
> +                                          VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);


Indentation.


> +        /* Bail out before touching avpkt_buf, which is not valid on failure;
> +         * the recorded commands are never submitted, so drop their deps. */
> +        if (ret < 0) {
> +            ff_vk_exec_discard_deps(vkctx, exec);
> +            return ret;
> +        }
> +
> +        avpkt->buf = avpkt_buf;
> +        buf_vk = (FFVkBuffer *)avpkt_buf->data;
> +        avpkt->data = buf_vk->mapped_mem;
> +        avpkt->size = max_frame_bytes << s->interlaced;
> +        s->enc_consts.pb = buf_vk->address;
> +
> +        init_put_bits(&s->pb, avpkt->data, avpkt->size);
> +    }
> +
> +    /* Sequence header */
> +    encode_parse_info(s, DIRAC_PCODE_SEQ_HEADER);
> +    encode_seq_header(s);
> +
> +    /* Encoder version */
> +    if (aux_data) {
> +        encode_parse_info(s, DIRAC_PCODE_AUX);
> +        ff_put_string(&s->pb, aux_data, 1);
> +    }
> +
> +    /* Picture header */
> +    encode_parse_info(s, DIRAC_PCODE_PICTURE_HQ);
> +    encode_picture_start(s);
> +
> +    /* Encode slices */
> +    vulkan_encode_slices(s, exec);
> +
> +    /* End sequence */
> +    encode_parse_info(s, DIRAC_PCODE_END_SEQ);
> +
> +    return 0;
> +}
> +
> +static av_cold int vc2_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
> +                                    const AVFrame *frame, int *got_packet)
> +{
> +    int ret = 0;
> +    int slice_ceil, sig_size = 256;
> +    VC2EncContext *s = avctx->priv_data;
> +    const int bitexact = avctx->flags & AV_CODEC_FLAG_BITEXACT;
> +    const char *aux_data = bitexact ? "Lavc" : LIBAVCODEC_IDENT;
> +    const int aux_data_size = bitexact ? sizeof("Lavc") : sizeof(LIBAVCODEC_IDENT);
> +    const int header_size = 100 + aux_data_size;
> +    int64_t r_bitrate = avctx->bit_rate >> (s->interlaced);
> +
> +    s->avctx = avctx;
> +    s->size_scaler = 2;
> +    s->prefix_bytes = 0;
> +    s->last_parse_code = 0;
> +    s->next_parse_offset = 0;
> +
> +    /* Rate control */
> +    s->frame_max_bytes = (av_rescale(r_bitrate, s->avctx->time_base.num,
> +                                     s->avctx->time_base.den) >> 3) - header_size;
> +    s->slice_max_bytes = slice_ceil = av_rescale(s->frame_max_bytes, 1, s->num_x*s->num_y);
> +
> +    /* Find an appropriate size scaler */
> +    while (sig_size > 255) {
> +        int r_size = SSIZE_ROUND(s->slice_max_bytes);
> +        if (r_size > slice_ceil) {
> +            s->slice_max_bytes -= r_size - slice_ceil;
> +            r_size = SSIZE_ROUND(s->slice_max_bytes);
> +        }
> +        sig_size = r_size/s->size_scaler; /* Signalled slize size */
> +        s->size_scaler <<= 1;
> +    }
> +
> +    s->slice_min_bytes = s->slice_max_bytes - s->slice_max_bytes*(s->tolerance/100.0f);
> +    if (s->slice_min_bytes < 0)
> +        return AVERROR(EINVAL);
> +
> +    /* Update slice calc push data */
> +    s->calc_consts.size_scaler = s->size_scaler;
> +    s->calc_consts.bits_ceil  = s->slice_max_bytes << 3;
> +    s->calc_consts.bits_floor = s->slice_min_bytes << 3;
> +    s->enc_consts.prefix_bytes = 0;
> +    s->enc_consts.size_scaler = s->size_scaler;
> +
> +    ret = encode_frame(s, avpkt, frame, aux_data, header_size, s->interlaced);
> +    if (ret)
> +        return ret;
> +    if (s->interlaced) {
> +        ret = encode_frame(s, avpkt, frame, aux_data, header_size, 2);
> +        if (ret)
> +            return ret;
> +    }
> +
> +    flush_put_bits(&s->pb);
> +    av_shrink_packet(avpkt, put_bytes_output(&s->pb));
> +    avpkt->flags |= AV_PKT_FLAG_KEY;
> +    *got_packet = 1;
> +
> +    return 0;
> +}
> +
> +static av_cold int vc2_encode_end(AVCodecContext *avctx)
> +{
> +    int i;
> +    VC2EncContext *s = avctx->priv_data;
> +
> +    av_log(avctx, AV_LOG_INFO, "Qavg: %i\n", s->q_avg);
> +
> +    for (i = 0; i < 3; i++) {
> +        ff_vc2enc_free_transforms(&s->transform_args[i].t);
> +        av_freep(&s->plane[i].coef_buf);
> +    }
> +
> +    return 0;
> +}
> +
> +static av_cold int vc2_encode_init(AVCodecContext *avctx)
> +{
> +    Plane *p;
> +    SubBand *b;
> +    int i, level, o, ret, depth;
> +    const AVPixFmtDescriptor *fmt;
> +    VC2EncContext *s = avctx->priv_data;
> +    FFVulkanContext *vkctx = &s->vkctx;
> +
> +    vkctx->frames_ref = av_buffer_ref(avctx->hw_frames_ctx);
> +    vkctx->frames = (AVHWFramesContext *)vkctx->frames_ref->data;
> +    vkctx->hwfc = vkctx->frames->hwctx;
> +    vkctx->device = (AVHWDeviceContext *)vkctx->frames->device_ref->data;
> +    vkctx->hwctx = vkctx->device->hwctx;
> +    vkctx->extensions = ff_vk_extensions_to_mask(vkctx->hwctx->enabled_dev_extensions,
> +                                                 vkctx->hwctx->nb_enabled_dev_extensions);
> +    ff_vk_load_functions(vkctx->device, &vkctx->vkfn, vkctx->extensions, 1, 1);
> +    ff_vk_load_props(vkctx);
> +
> +    s->picture_number = 0;
> +
> +    /* Total allowed quantization range */
> +    s->q_ceil    = DIRAC_MAX_QUANT_INDEX;
> +
> +    s->ver.major = 2;
> +    s->ver.minor = 0;
> +    s->profile   = 3;
> +    s->level     = 3;
> +
> +    s->base_vf   = -1;
> +    s->strict_compliance = 1;
> +
> +    s->q_avg = 0;
> +    s->slice_max_bytes = 0;
> +    s->slice_min_bytes = 0;
> +
> +    /* Mark unknown as progressive */
> +    s->interlaced = !((avctx->field_order == AV_FIELD_UNKNOWN) ||
> +                      (avctx->field_order == AV_FIELD_PROGRESSIVE));
> +
> +    for (i = 0; i < base_video_fmts_len; i++) {
> +        const VC2BaseVideoFormat *fmt = &base_video_fmts[i];
> +        if (avctx->pix_fmt != fmt->pix_fmt)
> +            continue;
> +        if (avctx->time_base.num != fmt->time_base.num)
> +            continue;
> +        if (avctx->time_base.den != fmt->time_base.den)
> +            continue;
> +        if (avctx->width != fmt->width)
> +            continue;
> +        if (avctx->height != fmt->height)
> +            continue;
> +        if (s->interlaced != fmt->interlaced)
> +            continue;
> +        s->base_vf = i;
> +        s->level   = base_video_fmts[i].level;
> +        break;
> +    }
> +
> +    if (s->interlaced)
> +        av_log(avctx, AV_LOG_WARNING, "Interlacing enabled!\n");
> +
> +    if ((s->slice_width  & (s->slice_width  - 1)) ||
> +        (s->slice_height & (s->slice_height - 1))) {
> +        av_log(avctx, AV_LOG_ERROR, "Slice size is not a power of two!\n");
> +        return AVERROR_UNKNOWN;
> +    }
> +
> +    if ((s->slice_width > avctx->width) ||
> +        (s->slice_height > avctx->height)) {
> +        av_log(avctx, AV_LOG_ERROR, "Slice size is bigger than the image!\n");
> +        return AVERROR_UNKNOWN;
> +    }
> +
> +    if (s->base_vf <= 0) {
> +        if (avctx->strict_std_compliance < FF_COMPLIANCE_STRICT) {
> +            s->strict_compliance = s->base_vf = 0;
> +            av_log(avctx, AV_LOG_WARNING, "Format does not strictly comply with VC2 specs\n");
> +        } else {
> +            av_log(avctx, AV_LOG_WARNING, "Given format does not strictly comply with "
> +                   "the specifications, decrease strictness to use it.\n");
> +            return AVERROR_UNKNOWN;
> +        }
> +    } else {
> +        av_log(avctx, AV_LOG_INFO, "Selected base video format = %i (%s)\n",
> +               s->base_vf, base_video_fmts[s->base_vf].name);
> +    }
> +
> +    /* Chroma subsampling */
> +    ret = av_pix_fmt_get_chroma_sub_sample(vkctx->frames->sw_format, &s->chroma_x_shift, &s->chroma_y_shift);
> +    if (ret)
> +        return ret;
> +
> +    /* Bit depth and color range index */
> +    fmt = av_pix_fmt_desc_get(vkctx->frames->sw_format);
> +    depth = fmt->comp[0].depth;
> +    if (depth == 8 && avctx->color_range == AVCOL_RANGE_JPEG) {
> +        s->bpp = 1;
> +        s->bpp_idx = 1;
> +        s->diff_offset = 128;
> +    } else if (depth == 8 && (avctx->color_range == AVCOL_RANGE_MPEG ||
> +               avctx->color_range == AVCOL_RANGE_UNSPECIFIED)) {
> +        s->bpp = 1;
> +        s->bpp_idx = 2;
> +        s->diff_offset = 128;
> +    } else if (depth == 10) {
> +        s->bpp = 2;
> +        s->bpp_idx = 3;
> +        s->diff_offset = 512;
> +    } else {
> +        s->bpp = 2;
> +        s->bpp_idx = 4;
> +        s->diff_offset = 2048;
> +    }
> +
> +    /* Planes initialization */
> +    for (i = 0; i < 3; i++) {
> +        int w, h;
> +        p = &s->plane[i];
> +        p->width      = avctx->width  >> (i ? s->chroma_x_shift : 0);
> +        p->height     = avctx->height >> (i ? s->chroma_y_shift : 0);
> +        if (s->interlaced)
> +            p->height >>= 1;
> +        p->dwt_width  = w = FFALIGN(p->width,  (1 << s->wavelet_depth));
> +        p->dwt_height = h = FFALIGN(p->height, (1 << s->wavelet_depth));
> +        p->coef_stride = FFALIGN(p->dwt_width, 32);
> +        for (level = s->wavelet_depth-1; level >= 0; level--) {
> +            w = w >> 1;
> +            h = h >> 1;
> +            for (o = 0; o < 4; o++) {
> +                b = &p->band[level][o];
> +                b->width  = w;
> +                b->height = h;
> +                b->stride = p->coef_stride;
> +                b->shift = (o > 1)*b->height*b->stride + (o & 1)*b->width;
> +            }
> +        }
> +
> +        /* DWT init */
> +        if (ff_vc2enc_init_transforms(&s->transform_args[i].t,
> +                                      s->plane[i].coef_stride,
> +                                      s->plane[i].dwt_height,
> +                                      s->slice_width, s->slice_height))
> +            return AVERROR(ENOMEM);
> +    }
> +
> +    /* Slices */
> +    s->num_x = s->plane[0].dwt_width/s->slice_width;
> +    s->num_y = s->plane[0].dwt_height/s->slice_height;
> +
> +    s->slice_args = av_calloc(s->num_x*s->num_y, sizeof(SliceArgs));
> +    if (!s->slice_args)
> +        return AVERROR(ENOMEM);
> +
> +    for (i = 0; i < 116; i++) {
> +        const uint64_t qf = ff_dirac_qscale_tab[i];
> +        const uint32_t m = av_log2(qf);
> +        const uint32_t t = (1ULL << (m + 32)) / qf;
> +        const uint32_t r = (t*qf + qf) & UINT32_MAX;
> +        if (!(qf & (qf - 1))) {
> +            s->qmagic_lut[i][0] = 0xFFFFFFFF;
> +            s->qmagic_lut[i][1] = 0xFFFFFFFF;
> +        } else if (r <= 1 << m) {
> +            s->qmagic_lut[i][0] = t + 1;
> +            s->qmagic_lut[i][1] = 0;
> +        } else {
> +            s->qmagic_lut[i][0] = t;
> +            s->qmagic_lut[i][1] = t;
> +        }
> +    }
> +    init_vulkan(avctx);
> +
> +    return 0;
> +}
> +
> +#define VC2ENC_FLAGS (AV_OPT_FLAG_ENCODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
> +static const AVOption vc2enc_options[] = {
> +    {"tolerance",     "Max undershoot in percent", offsetof(VC2EncContext, tolerance), AV_OPT_TYPE_DOUBLE, {.dbl = 5.0f}, 0.0f, 45.0f, VC2ENC_FLAGS, .unit = "tolerance"},
> +    {"slice_width",   "Slice width",  offsetof(VC2EncContext, slice_width), AV_OPT_TYPE_INT, {.i64 = 32}, 32, 1024, VC2ENC_FLAGS, .unit = "slice_width"},
> +    {"slice_height",  "Slice height", offsetof(VC2EncContext, slice_height), AV_OPT_TYPE_INT, {.i64 = 16}, 8, 1024, VC2ENC_FLAGS, .unit = "slice_height"},
> +    {"wavelet_depth", "Transform depth", offsetof(VC2EncContext, wavelet_depth), AV_OPT_TYPE_INT, {.i64 = 4}, 1, 5, VC2ENC_FLAGS, .unit = "wavelet_depth"},
> +    {"wavelet_type",  "Transform type",  offsetof(VC2EncContext, wavelet_idx), AV_OPT_TYPE_INT, {.i64 = VC2_TRANSFORM_HAAR_S}, 0, VC2_TRANSFORMS_NB, VC2ENC_FLAGS, .unit = "wavelet_idx"},
> +        {"5_3",          "LeGall (5,3)",            0, AV_OPT_TYPE_CONST, {.i64 = VC2_TRANSFORM_5_3},    INT_MIN, INT_MAX, VC2ENC_FLAGS, .unit = "wavelet_idx"},
> +        {"haar",         "Haar (with shift)",       0, AV_OPT_TYPE_CONST, {.i64 = VC2_TRANSFORM_HAAR_S}, INT_MIN, INT_MAX, VC2ENC_FLAGS, .unit = "wavelet_idx"},
> +        {"haar_noshift", "Haar (without shift)",    0, AV_OPT_TYPE_CONST, {.i64 = VC2_TRANSFORM_HAAR},   INT_MIN, INT_MAX, VC2ENC_FLAGS, .unit = "wavelet_idx"},
> +    {"qm", "Custom quantization matrix", offsetof(VC2EncContext, quant_matrix), AV_OPT_TYPE_INT, {.i64 = VC2_QM_DEF}, 0, VC2_QM_NB, VC2ENC_FLAGS, .unit = "quant_matrix"},
> +        {"default",   "Default from the specifications", 0, AV_OPT_TYPE_CONST, {.i64 = VC2_QM_DEF}, INT_MIN, INT_MAX, VC2ENC_FLAGS, .unit = "quant_matrix"},
> +        {"color",     "Prevents low bitrate discoloration", 0, AV_OPT_TYPE_CONST, {.i64 = VC2_QM_COL}, INT_MIN, INT_MAX, VC2ENC_FLAGS, .unit = "quant_matrix"},
> +        {"flat",      "Optimize for PSNR", 0, AV_OPT_TYPE_CONST, {.i64 = VC2_QM_FLAT}, INT_MIN, INT_MAX, VC2ENC_FLAGS, .unit = "quant_matrix"},
> +    {NULL}
> +};
> +
> +static const AVClass vc2enc_class = {
> +    .class_name = "vc2_vulkan_encoder",
> +    .category = AV_CLASS_CATEGORY_ENCODER,
> +    .option = vc2enc_options,
> +    .item_name = av_default_item_name,
> +    .version = LIBAVUTIL_VERSION_INT
> +};
> +
> +static const FFCodecDefault vc2enc_defaults[] = {
> +    { "b",              "600000000"   },
> +    { NULL },
> +};
> +
> +const AVCodecHWConfigInternal *const ff_vc2_hw_configs[] = {
> +    HW_CONFIG_ENCODER_FRAMES(VULKAN, VULKAN),
> +    HW_CONFIG_ENCODER_DEVICE(NONE,  VULKAN),
> +    NULL,
> +};
> +
> +const FFCodec ff_vc2_vulkan_encoder = {
> +    .p.name         = "vc2_vulkan",
> +    CODEC_LONG_NAME("SMPTE VC-2"),
> +    .p.type         = AVMEDIA_TYPE_VIDEO,
> +    .p.id           = AV_CODEC_ID_DIRAC,
> +    .p.capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_HARDWARE,
> +    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
> +    .priv_data_size = sizeof(VC2EncContext),
> +    .init           = vc2_encode_init,
> +    .close          = vc2_encode_end,
> +    FF_CODEC_ENCODE_CB(vc2_encode_frame),
> +    .p.priv_class   = &vc2enc_class,
> +    .defaults       = vc2enc_defaults,
> +    .p.pix_fmts = (const enum AVPixelFormat[]) {
> +        AV_PIX_FMT_VULKAN,
> +        AV_PIX_FMT_NONE,
> +    },
> +    .hw_configs     = ff_vc2_hw_configs,
> +};
> diff --git a/libavcodec/vulkan/common.comp b/libavcodec/vulkan/common.comp
> index 7572bfb4fa..b550fb0e14 100644
> --- a/libavcodec/vulkan/common.comp
> +++ b/libavcodec/vulkan/common.comp
> @@ -18,6 +18,9 @@
>    * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>    */
>   
> +#extension GL_EXT_buffer_reference : require
> +#extension GL_EXT_buffer_reference2 : require


Shouldn't you use ff_vk_shader_init() to enable these extensions?


> +
>   layout(buffer_reference, buffer_reference_align = 1) buffer u8buf {
>       uint8_t v;
>   };
> @@ -173,8 +176,8 @@ uint64_t put_bits_count(in PutBitContext pb)
>       return (pb.buf - pb.buf_start)*8 + BUF_BITS - pb.bit_left;
>   }
>   
> -uint32_t put_bytes_count(in PutBitContext pb)
> +int32_t put_bytes_count(in PutBitContext pb)
>   {
>       uint64_t num_bytes = (pb.buf - pb.buf_start) + ((BUF_BITS - pb.bit_left) >> 3);
> -    return uint32_t(num_bytes);
> +    return int32_t(num_bytes);
>   }
> diff --git a/libavcodec/vulkan/vc2_dwt_haar.comp b/libavcodec/vulkan/vc2_dwt_haar.comp
> new file mode 100644
> index 0000000000..69073cb17f
> --- /dev/null
> +++ b/libavcodec/vulkan/vc2_dwt_haar.comp
> @@ -0,0 +1,76 @@
> +#extension GL_EXT_scalar_block_layout : require
> +#extension GL_EXT_buffer_reference : require
> +
> +#define LOCAL_X 256
> +
> +layout(local_size_x = 16, local_size_y = 16, local_size_z = 1) in;
> +
> +layout(scalar, buffer_reference, buffer_reference_align = 4) buffer DwtCoef {
> +    int coef_buf[];
> +};


You should use the definitions from vulkan/common.comp by including it the
way ffv1enc_vulkan does, i.e. by pasting it at the start of the shader.


> +
> +struct Plane {
> +    ivec2 dim;
> +    ivec2 dwt_dim;
> +};
> +
> +layout(push_constant, scalar) uniform ComputeInfo {
> +    int s;
> +    int plane_idx;
> +    int wavelet_depth;
> +    Plane planes[3];
> +    DwtCoef pbuf[3];
> +};
> +
> +shared int local_coef[LOCAL_X];
> +
> +void main() {


Coding style: newline between () and {


> +    ivec2 coord = ivec2(gl_GlobalInvocationID.xy);
> +    ivec2 dwt_dim = planes[plane_idx].dwt_dim;
> +    if (any(greaterThanEqual(coord, dwt_dim))) {
> +        return;
> +    }


Coding style: no { } around one-line blocks.


> +    int index = dwt_dim.x * coord.y + coord.x;
> +    int value = pbuf[plane_idx].coef_buf[index];
> +
> +    /* Perform Haar wavelet on the 16x16 local workgroup with shared memory */
> +    for (int i = 0; i < wavelet_depth; i++) {
> +        ivec2 mask = ivec2((1 << i) - 1);
> +        if (any(notEqual(coord & mask, ivec2(0)))) {
> +            break;
> +        }
> +
> +        /* Offset between valid hor pixels for each level, +1, +2, +4 etc */
> +        int dist = (1 << i);
> +
> +        local_coef[gl_LocalInvocationIndex] = value;
> +        barrier();
> +
> +        /* Horizontal haar wavelet */
> +        uint other_id = gl_LocalInvocationIndex ^ dist;
> +        int other = local_coef[other_id];
> +        int a = gl_LocalInvocationIndex < other_id ? value : other;
> +        int b = gl_LocalInvocationIndex < other_id ? other : value;
> +        int dst_b = (b - a) * (1 << s);
> +        int dst_a = a * (1 << s) + ((dst_b + 1) >> 1);
> +        value = gl_LocalInvocationIndex < other_id ? dst_a : dst_b;
> +
> +        /* Offset between valid ver pixels for each level, +1, +2, +4 etc */
> +        dist <<= 4;
> +
> +        local_coef[gl_LocalInvocationIndex] = value;
> +        barrier();
> +
> +        /* Vertical haar wavelet */
> +        other_id = gl_LocalInvocationIndex ^ dist;
> +        other = local_coef[other_id];
> +        a = gl_LocalInvocationIndex < other_id ? value : other;
> +        b = gl_LocalInvocationIndex < other_id ? other : value;
> +        dst_b = b - a;
> +        dst_a = a + ((dst_b + 1) >> 1);
> +        value = gl_LocalInvocationIndex < other_id ? dst_a : dst_b;
> +    }
> +
> +    /* Store value */
> +    pbuf[plane_idx].coef_buf[index] = value;


You should use storage images instead. The new 32-bit integer format, along
with the possibility of accessing them in a bit-exact way, should be good
enough now.

They're much faster and much more robust, not to mention imageLoads are 
cached.


> +}
> \ No newline at end of file
> diff --git a/libavcodec/vulkan/vc2_dwt_haar_subgroup.comp b/libavcodec/vulkan/vc2_dwt_haar_subgroup.comp
> new file mode 100644
> index 0000000000..32fb04561a
> --- /dev/null
> +++ b/libavcodec/vulkan/vc2_dwt_haar_subgroup.comp
> @@ -0,0 +1,94 @@
> +#extension GL_EXT_scalar_block_layout : require
> +#extension GL_KHR_shader_subgroup_basic : require
> +#extension GL_KHR_shader_subgroup_shuffle : require
> +#extension GL_EXT_buffer_reference : require
> +
> +#define TILE_DIM 8
> +#define LOCAL_X 64
> +
> +layout(local_size_x = LOCAL_X, local_size_y = 1, local_size_z = 1) in;
> +
> +layout(scalar, buffer_reference, buffer_reference_align = 4) buffer DwtCoef {
> +    int coef_buf[];
> +};
> +
> +struct Plane {
> +    ivec2 dim;
> +    ivec2 dwt_dim;
> +};
> +
> +layout(push_constant, scalar) uniform ComputeInfo {
> +    int s;
> +    int plane_idx;
> +    int wavelet_depth;
> +    Plane planes[3];
> +    DwtCoef pbuf[3];
> +};


I'd prefer to have these defined in the .c encoder file alongside their
equivalent C structure so it's easy to see if there are discrepancies.


> +
> +int dwt_haar_subgroup(int value, int i) {
> +    /* Offset between valid hor pixels for each level, +1, +2, +4 etc */
> +    int dist = (1 << i);
> +
> +    /* Horizontal haar wavelet */
> +    uint other_sub_id = gl_SubgroupInvocationID ^ dist;
> +    int other = subgroupShuffle(value, other_sub_id);
> +    int a = gl_SubgroupInvocationID < other_sub_id ? value : other;
> +    int b = gl_SubgroupInvocationID < other_sub_id ? other : value;
> +    int dst_b = (b - a) * (1 << s);
> +    int dst_a = a * (1 << s) + ((dst_b + 1) >> 1);
> +    value = gl_SubgroupInvocationID < other_sub_id ? dst_a : dst_b;
> +
> +    /* Offset between valid ver pixels for each level, +1, +2, +4 etc */
> +    dist <<= 3;
> +
> +    /* Vertical haar wavelet */
> +    other_sub_id = gl_SubgroupInvocationID ^ dist;
> +    other = subgroupShuffle(value, other_sub_id);
> +    a = gl_SubgroupInvocationID < other_sub_id ? value : other;
> +    b = gl_SubgroupInvocationID < other_sub_id ? other : value;
> +    dst_b = b - a;
> +    dst_a = a + ((dst_b + 1) >> 1);
> +    return gl_SubgroupInvocationID < other_sub_id ? dst_a : dst_b;
> +}
> +
> +bool is_thread_active(int i, ivec2 coord) {
> +    if (i > wavelet_depth - 1) {
> +        return false;
> +    }
> +    ivec2 mask = ivec2((1 << i) - 1);
> +    if (any(notEqual(coord & mask, ivec2(0)))) {
> +        return false;
> +    }
> +    return true;
> +}
> +
> +void main() {
> +    ivec2 tile_coord = ivec2(gl_WorkGroupID.xy);
> +    ivec2 local_coord = ivec2(gl_LocalInvocationIndex & 7, gl_LocalInvocationIndex >> 3);
> +    ivec2 coord = tile_coord * ivec2(TILE_DIM) + local_coord;
> +    ivec2 dwt_dim = planes[plane_idx].dwt_dim;
> +    if (any(greaterThanEqual(coord, dwt_dim))) {
> +        return;
> +    }
> +    int index = dwt_dim.x * coord.y + coord.x;
> +    int value = pbuf[plane_idx].coef_buf[index];
> +
> +    if (gl_SubgroupSize == 64) {
> +        for (int i = 0; i < 3; i++) {
> +            if (!is_thread_active(i, local_coord)) {
> +                break;
> +            }
> +            value = dwt_haar_subgroup(value, i);
> +        }
> +    } else {
> +        for (int i = 0; i < 2; i++) {
> +            if (!is_thread_active(i, local_coord)) {
> +                break;
> +            }
> +            value = dwt_haar_subgroup(value, i);
> +        }
> +    }
> +
> +    // Store value
> +    pbuf[plane_idx].coef_buf[index] = value;
> +}
> \ No newline at end of file
> diff --git a/libavcodec/vulkan/vc2_dwt_hor_legall.comp b/libavcodec/vulkan/vc2_dwt_hor_legall.comp
> new file mode 100644
> index 0000000000..9c3945825e
> --- /dev/null
> +++ b/libavcodec/vulkan/vc2_dwt_hor_legall.comp
> @@ -0,0 +1,61 @@
> +#extension GL_EXT_scalar_block_layout : require
> +#extension GL_EXT_buffer_reference : require
> +
> +layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
> +
> +layout(scalar, buffer_reference, buffer_reference_align = 4) buffer DwtCoef {
> +    int coef_buf[];
> +};
> +
> +struct Plane {
> +    ivec2 dim;
> +    ivec2 dwt_dim;
> +};
> +
> +layout(push_constant, scalar) uniform ComputeInfo {
> +    int s;
> +    int diff_offset;
> +    int level;
> +    Plane planes[3];
> +    DwtCoef pbuf[3];
> +};
> +
> +int align(int x, int a) {
> +    return (x + a - 1) & ~(a - 1);
> +}
> +
> +void main() {
> +    int coord_y = int(gl_GlobalInvocationID.x);
> +    uint plane_idx = gl_GlobalInvocationID.z;
> +    ivec2 work_area = planes[plane_idx].dwt_dim;
> +    int dist = 1 << level;
> +    if (coord_y >= work_area.y || (coord_y & (dist - 1)) != 0) {
> +        return;
> +    }
> +
> +    DwtCoef buf = pbuf[plane_idx];
> +    ivec2 dwt_area = work_area >> 1;
> +    int stride = align(planes[plane_idx].dwt_dim.x, 32);
> +    int start = stride * coord_y;
> +
> +    // Shift in one bit that is used for additional precision
> +    for (int x = 0; x < work_area.x; x += dist) {
> +        buf.coef_buf[start + x] = buf.coef_buf[start + x] << 1;
> +    }
> +
> +    // Lifting stage 2
> +    for (int x = 0; x < work_area.x - 2 * dist; x += 2 * dist) {
> +        buf.coef_buf[start + x + dist] -= (buf.coef_buf[start + x] +
> +                                           buf.coef_buf[start + x + 2 * dist] + 1) >> 1;
> +    }
> +    buf.coef_buf[start + work_area.x - dist] -= (2 * buf.coef_buf[start + work_area.x - 2 * dist] + 1) >> 1;
> +
> +    // Lifting stage 1
> +    buf.coef_buf[start] += (2 * buf.coef_buf[start + dist] + 2) >> 2;
> +    for (int x = 2 * dist; x < work_area.x - 2 * dist; x += 2 * dist) {
> +        buf.coef_buf[start + x] += (buf.coef_buf[start + x - dist] +
> +                                    buf.coef_buf[start + x + dist] + 2) >> 2;
> +    }
> +    buf.coef_buf[start + work_area.x - 2 * dist] += (buf.coef_buf[start + work_area.x - 3 * dist] +
> +                                                     buf.coef_buf[start + work_area.x - dist] + 2) >> 2;
> +}
> \ No newline at end of file
> diff --git a/libavcodec/vulkan/vc2_dwt_legall.comp b/libavcodec/vulkan/vc2_dwt_legall.comp
> new file mode 100644
> index 0000000000..1c640022f5
> --- /dev/null
> +++ b/libavcodec/vulkan/vc2_dwt_legall.comp
> @@ -0,0 +1,74 @@
> +#extension GL_EXT_scalar_block_layout : require
> +#extension GL_EXT_debug_printf : require
> +
> +#define TILE_DIM 16
> +#define LOCAL_X 256
> +
> +layout(local_size_x = TILE_DIM, local_size_y = TILE_DIM, local_size_z = 1) in;
> +
> +layout(scalar, buffer_reference, buffer_reference_align = 4) buffer DwtCoef {
> +    int coef_buf[];
> +};
> +
> +struct Plane {
> +    ivec2 dim;
> +    ivec2 dwt_dim;
> +};
> +
> +layout(push_constant, scalar) uniform ComputeInfo {
> +    int s;
> +    int plane_idx;
> +    int wavelet_depth;
> +    Plane planes[3];
> +    DwtCoef dst_buf[3];
> +    DwtCoef src_buf[3];
> +};
> +
> +int align(int x, int a) {
> +    return (x + a - 1) & ~(a - 1);
> +}


This should be in vulkan/common.comp (replace the original slow version
there).


> +
> +shared uint local_coef[LOCAL_X];
> +
> +void main() {
> +    ivec2 coord = ivec2(gl_GlobalInvocationID.xy);
> +    ivec2 work_area = planes[plane_idx].dwt_dim;
> +    ivec2 dwt_area = work_area >> 1;
> +    if (any(greaterThanEqual(coord, work_area))) {
> +        return;
> +    }
> +
> +    DwtCoef src = src_buf[plane_idx];
> +    DwtCoef dst = dst_buf[plane_idx];
> +    int stride = align(planes[plane_idx].dwt_dim.x, 32);
> +    int start = stride * coord.y;
> +
> +    for (int i = 0; i < wavelet_depth; i++) {
> +        ivec2 mask = ivec2((1 << i) - 1);
> +        if (any(notEqual(coord & mask, ivec2(0)))) {
> +            break;
> +        }
> +
> +        mask <<= 1;
> +        mask |= 1;
> +
> +        // Shift in one bit that is used for additional precision
> +        for (int x = 0; x < work_area.x; x++) {
> +            dst.coef_buf[start + x] = src.coef_buf[start + x] << 1;
> +        }
> +
> +        // Lifting stage 2
> +        for (int x = 0; x < dwt_area.x - 1; x++) {
> +            dst.coef_buf[start + 2 * x + 1] -= (dst.coef_buf[start + 2 * x] +
> +                                                dst.coef_buf[start + 2 * x + 2] + 1) >> 1;
> +        }
> +        dst.coef_buf[start + work_area.x - 1] -= (2 * dst.coef_buf[start + work_area.x - 2] + 1) >> 1;
> +
> +        // Lifting stage 1
> +        dst.coef_buf[start] += (2 * dst.coef_buf[start + 1] + 2) >> 2;
> +        for (int x = 1; x <= dwt_area.x - 1; x++) {
> +            dst.coef_buf[start + 2 * x] += (dst.coef_buf[start + 2 * x - 1] +
> +                                            dst.coef_buf[start + 2 * x + 1] + 2) >> 2;
> +        }
> +    }
> +}
> \ No newline at end of file
> diff --git a/libavcodec/vulkan/vc2_dwt_upload.comp b/libavcodec/vulkan/vc2_dwt_upload.comp
> new file mode 100644
> index 0000000000..943ebf23d7
> --- /dev/null
> +++ b/libavcodec/vulkan/vc2_dwt_upload.comp
> @@ -0,0 +1,45 @@
> +#extension GL_EXT_scalar_block_layout : require
> +#extension GL_EXT_shader_explicit_arithmetic_types : require
> +#extension GL_EXT_buffer_reference : require
> +
> +layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
> +
> +layout(scalar, buffer_reference, buffer_reference_align = 4) buffer DwtCoef {
> +    int coef_buf[];
> +};
> +
> +layout(scalar, buffer_reference, buffer_reference_align = 1) buffer PlaneBuf {
> +    uint8_t data[];
> +};
> +
> +struct Plane {
> +    ivec2 dim;
> +    ivec2 dwt_dim;
> +};
> +
> +layout(push_constant, scalar) uniform ComputeInfo {
> +    int s;
> +    int diff_offset;
> +    int level;
> +    Plane planes[3];
> +    DwtCoef pbuf[3];
> +};
> +
> +int align(int x, int a) {
> +    return (x + a - 1) & ~(a - 1);
> +}
> +
> +void main() {
> +    ivec2 coord = ivec2(gl_GlobalInvocationID.xy);
> +    uint plane_idx = gl_GlobalInvocationID.z;
> +    ivec2 work_area = planes[plane_idx].dwt_dim;
> +    if (any(greaterThanEqual(coord, work_area))) {
> +        return;
> +    }
> +    int stride = align(planes[plane_idx].dwt_dim.x, 32);
> +    uint coef_idx = coord.y * stride + coord.x;
> +    ivec2 coord_i = clamp(coord, ivec2(0), planes[plane_idx].dim);
> +    uint texel = imageLoad(plane_imgs[plane_idx], coord_i).x;
> +    int result = int(texel - diff_offset);
> +    pbuf[plane_idx].coef_buf[coef_idx] = result;
> +}
> \ No newline at end of file
> diff --git a/libavcodec/vulkan/vc2_dwt_ver_legall.comp b/libavcodec/vulkan/vc2_dwt_ver_legall.comp
> new file mode 100644
> index 0000000000..6662bd656f
> --- /dev/null
> +++ b/libavcodec/vulkan/vc2_dwt_ver_legall.comp
> @@ -0,0 +1,55 @@
> +#extension GL_EXT_scalar_block_layout : require
> +#extension GL_EXT_buffer_reference : require
> +
> +layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;


Always define the layout via ff_vk_shader_init().


> +
> +layout(scalar, buffer_reference, buffer_reference_align = 4) buffer DwtCoef {
> +    int coef_buf[];
> +};
> +
> +struct Plane {
> +    ivec2 dim;
> +    ivec2 dwt_dim;
> +};
> +
> +layout(push_constant, scalar) uniform ComputeInfo {
> +    int s;
> +    int diff_offset;
> +    int level;
> +    Plane planes[3];
> +    DwtCoef pbuf[3];
> +};
> +
> +int align(int x, int a) {
> +    return (x + a - 1) & ~(a - 1);
> +}
> +
> +void main() {
> +    int coord_x = int(gl_GlobalInvocationID.x);
> +    uint plane_idx = gl_GlobalInvocationID.z;
> +    ivec2 work_area = planes[plane_idx].dwt_dim;
> +    int dist = 1 << level;
> +    if (coord_x >= work_area.x || (coord_x & (dist - 1)) != 0) {
> +        return;
> +    }
> +
> +    DwtCoef buf = pbuf[plane_idx];
> +    ivec2 dwt_area = work_area >> 1;
> +    int stride = align(planes[plane_idx].dwt_dim.x, 32);
> +
> +    // Lifting stage 2
> +    for (int y = dist; y < work_area.y - 2 * dist; y += 2 * dist) {
> +        buf.coef_buf[stride * y + coord_x] -= (buf.coef_buf[stride * (y - dist) + coord_x] +
> +                                               buf.coef_buf[stride * (y + dist) + coord_x] + 1) >> 1;
> +    }
> +    buf.coef_buf[stride * (work_area.y - dist) + coord_x] -= (2 * buf.coef_buf[stride * (work_area.y - 2 * dist) + coord_x] + 1) >> 1;
> +
> +    // Lifting stage 1
> +    buf.coef_buf[coord_x] += (2 * buf.coef_buf[stride * dist + coord_x] + 2) >> 2;
> +    for (int y = 2 * dist; y < work_area.y - 2 * dist; y += 2 * dist) {
> +        buf.coef_buf[stride * y + coord_x] += (buf.coef_buf[stride * (y + dist) + coord_x] +
> +                                               buf.coef_buf[stride * (y - dist) + coord_x] + 2) >> 2;
> +    }
> +    buf.coef_buf[stride * (work_area.y - 2 * dist) + coord_x] += (buf.coef_buf[stride * (work_area.y - 3 * dist) + coord_x] +
> +                                                                  buf.coef_buf[stride * (work_area.y - dist) + coord_x] + 2) >> 2;
> +}


Storage images in particular would speed this up by a long shot.


> \ No newline at end of file
> diff --git a/libavcodec/vulkan/vc2_encode.comp b/libavcodec/vulkan/vc2_encode.comp
> new file mode 100644
> index 0000000000..dc90cee257
> --- /dev/null
> +++ b/libavcodec/vulkan/vc2_encode.comp
> @@ -0,0 +1,204 @@
> +#extension GL_EXT_shader_explicit_arithmetic_types : require
> +#extension GL_EXT_scalar_block_layout : require
> +#extension GL_EXT_buffer_reference : require
> +#extension GL_EXT_debug_printf : require
> +
> +#define WORKGROUP_X 128
> +layout(local_size_x = WORKGROUP_X, local_size_y = 1, local_size_z = 1) in;


Same here: define the layout via ff_vk_shader_init().


> +
> +#define MAX_DWT_LEVELS (5)
> +
> +struct SliceArgs {
> +    int quant_idx;
> +    int bytes;
> +    int pb_start;
> +    int pad;
> +};
> +
> +struct Plane {
> +    ivec2 dim;
> +    ivec2 dwt_dim;
> +};
> +
> +layout(std430, buffer_reference, buffer_reference_align = 16) buffer SliceArgBuf {
> +    SliceArgs args[];
> +};
> +layout(scalar, buffer_reference, buffer_reference_align = 4) buffer DwtCoef {
> +    int coef_buf[];
> +};
> +layout(scalar, buffer_reference, buffer_reference_align = 1) buffer BitBuf {
> +    uint data[];
> +};
> +layout(scalar, buffer_reference, buffer_reference_align = 1) buffer BitBufByte {
> +    uint8_t data[];
> +};


These should come from the common file (vulkan/common.comp) as well.


> +layout(scalar, buffer_reference, buffer_reference_align = 4) buffer QuantLuts {
> +    int quant[5][4];
> +    int ff_dirac_qscale_tab[116];
> +};
> +
> +layout(push_constant, scalar) uniform ComputeInfo {
> +    DwtCoef plane_dat[3];
> +    BitBuf bytestream;
> +    QuantLuts luts;
> +    SliceArgBuf slice;
> +    ivec2 num_slices;
> +    Plane planes[3];
> +    int wavelet_depth;
> +    int size_scaler;
> +    int prefix_bytes;
> +};
> +
> +/*
> + * Same as skip_put_bytes in put_bits.h but fills in 0xFF.
> + *
> + * Appends n bytes of all-one bits to the writer `pb`. Three cases are
> + * handled in order: the padding fits in the free part of the bit
> + * accumulator; the accumulator must first be topped up and stored; and the
> + * remainder is written as whole 32-bit words plus a tail kept in the
> + * accumulator.
> + *
> + * NOTE(review): both masks are built with a 32-bit `1 << k`. The
> + * u32vec2buf store suggests a 64-bit accumulator, in which case k could
> + * reach 32 or more - confirm BUF_BITS and the shift ranges.
> + */
> +void skip_put_bytes(inout PutBitContext pb, int n) {
> +    /* Whole bytes still free in the accumulator. */
> +    int bytes_left = pb.bit_left >> 3;
> +    if (n < bytes_left) {
> +        /* Padding fits: shift in n*8 one-bits and stop. */
> +        int n_bits = n << 3;
> +        int mask = (1 << n_bits) - 1;
> +        pb.bit_buf <<= n_bits;
> +        pb.bit_buf |= mask;
> +        pb.bit_left -= uint8_t(n_bits);
> +        return;
> +    }
> +    if (pb.bit_left < BUF_BITS) {
> +        /* Accumulator is partly filled: pad the rest with ones, store it,
> +         * and deduct the bytes this consumed from n. */
> +        int mask = (1 << pb.bit_left) - 1;
> +        pb.bit_buf <<= pb.bit_left;
> +        pb.bit_buf |= mask;
> +        u32vec2buf(pb.buf).v = BUF_REVERSE(pb.bit_buf);
> +        pb.buf += BUF_BYTES;
> +        n -= pb.bit_left >> 3;
> +    }
> +    /* Write the bulk of the padding as whole 0xFFFFFFFF words. */
> +    int skip_dwords = n >> 2;
> +    while (skip_dwords > 0) {
> +        u32buf(pb.buf).v = 0xFFFFFFFF;
> +        pb.buf += 4;
> +        skip_dwords--;
> +    }
> +    /* The last (n & 3) bytes stay in the accumulator as one-bits. */
> +    int skip_bits = (n & 3) << 3;
> +    pb.bit_buf = (1 << skip_bits) - 1;
> +    pb.bit_left = uint8_t(BUF_BITS - skip_bits);
> +}


This should be in the common file.


> +
> +/*
> + * Writes `val` to `pb` as an interleaved unsigned code: zero is a single
> + * '1' bit; otherwise val + 1 is emitted as pairs of (0, data bit) for every
> + * bit below its leading one, followed by a terminating '1'.
> + */
> +void put_vc2_ue_uint(inout PutBitContext pb, uint val) {
> +    if (val == 0) {
> +        put_bits(pb, 1, 1);
> +        return;
> +    }
> +    val++;
> +
> +    /* Count the bits below the leading one of val; msb ends up on that bit. */
> +    int n_bits = 0, msb = 1, limit = 1;
> +    for (; val > limit; n_bits++) {
> +        msb <<= 1;
> +        limit = (limit << 1) | 1;
> +    }
> +
> +    /* Walk the data bits from high to low, giving each a two-bit slot. */
> +    int code = 0;
> +    for (int k = n_bits; k > 0; k--) {
> +        msb >>= 1;
> +        code <<= 2;
> +        code |= int((val & msb) != 0);
> +    }
> +
> +    /* Append the terminating '1' and emit 2 * n_bits + 1 bits in one go. */
> +    put_bits(pb, n_bits * 2 + 1, (code << 1) | 1);
> +}
> +
> +/* Rounds x up using the bit mask (a - 1); the result is a multiple of a
> + * only when a is a power of two. */
> +int align(int x, int a) {
> +    const int low_mask = a - 1;
> +    return (x + low_mask) & ~low_mask;
> +}
> +
> +/* Per-invocation quantiser index for each [level][orientation]; filled by main(). */
> +int quants[MAX_DWT_LEVELS][4];
> +
> +/*
> + * Maps a coordinate inside a subband to a coordinate in the coefficient
> + * plane: `h` becomes the low bit of the doubled index, and the result is
> + * scaled up by 2^(wavelet_depth - lvl - 1).
> + */
> +int subband_coord(int index, int h, int lvl) {
> +    const int interleaved = (index << 1) | h;
> +    return interleaved << (wavelet_depth - lvl - 1);
> +}
> +
> +/*
> + * Encodes one slice per invocation: writes the slice's quantiser index and
> + * then, for each of the three planes, a length byte followed by the
> + * quantised coefficients of every subband and 0xFF padding.
> + */
> +void main() {
> +    int slice_index = int(gl_GlobalInvocationID.x);
> +    int max_index = num_slices.x * num_slices.y;
> +    /* The dispatch may cover more invocations than there are slices. */
> +    if (slice_index >= max_index) {
> +        return;
> +    }
> +
> +    /* Step 2. Quantize and encode */
> +    /* Output offset: this slice's pb_start plus, for every preceding
> +     * workgroup, pb_start + bytes of that workgroup's last slice.
> +     * Presumably pb_start is relative to the slice's own workgroup, so the
> +     * added terms are the sizes of the preceding workgroups - confirm
> +     * against the pass that fills SliceArgs. */
> +    int pb_start = slice.args[slice_index].pb_start;
> +    for (int i = 0, index = WORKGROUP_X - 1; i < gl_WorkGroupID.x; i++) {
> +        pb_start += slice.args[index].pb_start + slice.args[index].bytes;
> +        index += WORKGROUP_X;
> +    }
> +    /* Position of the slice in the slice grid (row-major numbering). */
> +    ivec2 slice_coord = ivec2(slice_index % num_slices.x, slice_index / num_slices.x);
> +    int slice_bytes_max = slice.args[slice_index].bytes;
> +    int quant_index = slice.args[slice_index].quant_idx;
> +
> +    PutBitContext pb;
> +    init_put_bits(pb, OFFBUF(u8buf, bytestream, pb_start), slice_bytes_max);
> +
> +    /* Per-subband quantiser index: slice index minus the table offset,
> +     * floored at 0. Orientation 0 exists only at level 0. */
> +    for (int level = 0; level < wavelet_depth; level++)
> +        for (int orientation = int(level > 0); orientation < 4; orientation++)
> +            quants[level][orientation] = max(quant_index - luts.quant[level][orientation], 0);
> +
> +    /* Write quant index for this slice */
> +    put_bits(pb, 8, quant_index);
> +
> +    /* Luma + 2 Chroma planes */
> +    for (int p = 0; p < 3; p++) {
> +        int pad_s, pad_c;
> +        int bytes_start = put_bytes_count(pb);
> +
> +        /* Save current location and write a zero value */
> +        /* The zero byte is a placeholder for the plane length, patched below. */
> +        uint64_t write_ptr_start = pb.buf;
> +        int bit_left_start = pb.bit_left;
> +        put_bits(pb, 8, 0);
> +        for (int level = 0; level < wavelet_depth; level++) {
> +            /* Subband size at this level. */
> +            ivec2 band_size = planes[p].dwt_dim >> (wavelet_depth - level);
> +            for (int o = int(level > 0); o < 4; o++) {
> +                /* Encode subband */
> +                /* Rectangle of the subband that belongs to this slice. */
> +                int left = band_size.x * (slice_coord.x) / num_slices.x;
> +                int right = band_size.x * (slice_coord.x+1) / num_slices.x;
> +                int top = band_size.y * (slice_coord.y) / num_slices.y;
> +                int bottom = band_size.y * (slice_coord.y+1) / num_slices.y;
> +
> +                const int q_idx = quants[level][o];
> +                const int qfactor = luts.ff_dirac_qscale_tab[q_idx];
> +
> +                /* Orientation bit 1 is the vertical selector, bit 0 the horizontal one. */
> +                const int yh = o >> 1;
> +                const int xh = o & 1;
> +
> +                int stride = align(planes[p].dwt_dim.x, 32);
> +                for (int y = top; y < bottom; y++) {
> +                    for (int x = left; x < right; x++) {
> +                        int sx = subband_coord(x, xh, level);
> +                        int sy = subband_coord(y, yh, level);
> +                        int coef = plane_dat[p].coef_buf[sy * stride + sx];
> +                        /* Quantise the magnitude; a sign bit follows only
> +                         * when the quantised value is non-zero. */
> +                        uint c_abs = uint(abs(coef));
> +                        c_abs = (c_abs << 2) / qfactor;
> +                        put_vc2_ue_uint(pb, c_abs);
> +                        if (c_abs != 0) {
> +                            put_bits(pb, 1, int(coef < 0));
> +                        }
> +                    }
> +                }
> +            }
> +        }
> +        flush_put_bits(pb);
> +        /* Coded bytes of this plane, excluding the length byte itself. */
> +        int bytes_len = put_bytes_count(pb) - bytes_start - 1;
> +        if (p == 2) {
> +            /* Last plane: its length also absorbs the unused slice budget. */
> +            int len_diff = slice_bytes_max - put_bytes_count(pb);
> +            pad_s = align((bytes_len + len_diff), size_scaler)/size_scaler;
> +            pad_c = (pad_s*size_scaler) - bytes_len;
> +        } else {
> +            pad_s = align(bytes_len, size_scaler)/size_scaler;
> +            pad_c = (pad_s*size_scaler) - bytes_len;
> +        }
> +        /* Address of the placeholder byte: the buffer pointer saved above
> +         * plus, presumably, the bytes then still held in the accumulator. */
> +        uint64_t start_ptr = write_ptr_start + ((BUF_BITS - bit_left_start) >> 3);
> +        /* Store the plane length in units of size_scaler. */
> +        u8buf(start_ptr).v = uint8_t(pad_s);
> +        /* vc2-reference uses that padding that decodes to '0' coeffs */
> +        skip_put_bytes(pb, pad_c);
> +    }
> +}
> \ No newline at end of file
> diff --git a/libavcodec/vulkan/vc2_slice_sizes.comp b/libavcodec/vulkan/vc2_slice_sizes.comp
> new file mode 100644
> index 0000000000..a965e911c7
> --- /dev/null
> +++ b/libavcodec/vulkan/vc2_slice_sizes.comp
> @@ -0,0 +1,184 @@
> +/* Extensions for sized integer types, scalar block layout and buffer references. */
> +#extension GL_EXT_shader_explicit_arithmetic_types : require
> +#extension GL_EXT_scalar_block_layout : require
> +#extension GL_EXT_buffer_reference : require
> +
> +/* main() handles one slice per invocation; also sizes the shared slice_sizes[] array. */
> +#define WORKGROUP_X 128
> +layout(local_size_x = WORKGROUP_X, local_size_y = 1, local_size_z = 1) in;
> +
> +/* Number of quantiser indices: sizes cache[] and bounds the search in main(). */
> +#define DIRAC_MAX_QUANT_INDEX 116
> +/* Upper bound on the wavelet depth; sizes quants[][]. */
> +#define MAX_DWT_LEVELS 5
> +
> +/* Per-slice parameters; main() reads quant_idx and writes the first three fields. */
> +struct SliceArgs {
> +    int quant_idx; /* in: starting quantiser index; out: the index selected */
> +    int bytes;     /* out: rounded slice size in bytes */
> +    int pb_start;  /* out: summed sizes of the preceding slices of the same workgroup */
> +    int pad;       /* not referenced here; presumably pads the struct to 16 bytes */
> +};
> +
> +/* Dimensions of one plane. */
> +struct Plane {
> +    ivec2 dim;     /* not referenced in this shader */
> +    ivec2 dwt_dim; /* coefficient-plane size; x (aligned to 32) is used as row stride */
> +};
> +
> +/* Wavelet coefficients of one plane as 32-bit integers. */
> +layout(scalar, buffer_reference, buffer_reference_align = 4) buffer DwtCoef {
> +    int coef_buf[];
> +};
> +
> +/* Per-slice parameter array, indexed by slice number. */
> +layout(std430, buffer_reference) buffer SliceArgBuf {
> +    SliceArgs args[];
> +};
> +
> +/* Quantisation lookup tables shared by all invocations. */
> +layout(scalar, buffer_reference, buffer_reference_align = 4) buffer QuantLuts {
> +    /* Per [level][orientation] offset subtracted from the slice quantiser index. */
> +    int quant[5][4];
> +    /* Divisor applied to (|coef| << 2), indexed by the resulting quantiser index. */
> +    int ff_dirac_qscale_tab[116];
> +};
> +
> +/* Push constants supplied by the host for the slice-sizing pass. */
> +layout(push_constant, scalar) uniform ComputeInfo {
> +    DwtCoef plane_dat[3];   /* coefficient buffer of each plane */
> +    QuantLuts luts;         /* quantisation tables */
> +    SliceArgBuf slice;      /* per-slice parameters, updated by main() */
> +    ivec2 num_slices;       /* slice grid: x slices per row, y rows */
> +    Plane planes[3];        /* per-plane dimensions */
> +    int wavelet_depth;      /* number of wavelet levels iterated over */
> +    int size_scaler;        /* unit, in bytes, to which plane and slice sizes are rounded */
> +    int prefix_bytes;       /* bytes counted per slice before the quantiser index */
> +    int bits_ceil;          /* largest acceptable slice size in bits */
> +    int bits_floor;         /* smallest acceptable slice size in bits */
> +};
> +
> +/*
> + * Returns the number of bits of the interleaved unsigned code for `val`:
> + * 1 for zero, otherwise 2 * k + 1, where k is the number of bits below
> + * the leading one of val + 1.
> + */
> +int count_vc2_ue_uint(uint val) {
> +    uint maxval = 1;
> +    int bits = 0;
> +    if (val == 0) {
> +        return 1;
> +    }
> +    val++;
> +    /* Grow an all-ones limit until it covers val; each step adds one data bit. */
> +    while (val > maxval) {
> +        bits++;
> +        maxval <<= 1;
> +        maxval |=  1;
> +    }
> +    return bits * 2 + 1;
> +}
> +
> +/* Rounds x up using the bit mask (a - 1); the result is a multiple of a
> + * only when a is a power of two. */
> +int ffalign(int x, int a) {
> +    const int low_mask = a - 1;
> +    return (x + low_mask) & ~low_mask;
> +}
> +
> +/* Per-invocation memo of count_hq_slice() results by quantiser index; 0 = not computed. */
> +int cache[DIRAC_MAX_QUANT_INDEX];
> +/* Per-invocation quantiser index for each [level][orientation]; refilled by count_hq_slice(). */
> +int quants[MAX_DWT_LEVELS][4];
> +/* Slice sizes in bytes of the current workgroup, indexed by local invocation. */
> +shared int slice_sizes[WORKGROUP_X];
> +
> +/*
> + * Maps a coordinate inside a subband to a coordinate in the coefficient
> + * plane: `h` becomes the low bit of the doubled index, and the result is
> + * scaled up by 2^(wavelet_depth - lvl - 1).
> + */
> +int subband_coord(int index, int h, int lvl) {
> +    const int interleaved = (index << 1) | h;
> +    return interleaved << (wavelet_depth - lvl - 1);
> +}
> +
> +/*
> + * Returns the size in bits that the current invocation's slice would take
> + * when coded with quantiser index `quant_index`. Nothing is written to
> + * the bitstream; only code lengths are summed. Results are memoised in
> + * cache[], and quants[][] is overwritten on every uncached call.
> + */
> +int count_hq_slice(int quant_index) {
> +    int bits = 0;
> +    /* A computed size is never 0 (at least the 8-bit quantiser index is
> +     * counted), so 0 marks an empty cache entry. */
> +    if (cache[quant_index] != 0) {
> +        return cache[quant_index];
> +    }
> +
> +    bits += 8*prefix_bytes;
> +    bits += 8; /* quant_idx */
> +
> +    /* Per-subband quantiser index: slice index minus the table offset,
> +     * floored at 0. Orientation 0 exists only at level 0. */
> +    for (int level = 0; level < wavelet_depth; level++)
> +        for (int orientation = int(level > 0); orientation < 4; orientation++)
> +            quants[level][orientation] = max(quant_index - luts.quant[level][orientation], 0);
> +
> +    int slice_index = int(gl_GlobalInvocationID.x);
> +    /* Position of the slice in the slice grid (row-major numbering). */
> +    ivec2 slice_coord = ivec2(slice_index % num_slices.x, slice_index / num_slices.x);
> +    for (int p = 0; p < 3; p++) {
> +        int bytes_start = bits >> 3;
> +        /* One byte per plane for its length field. */
> +        bits += 8;
> +        for (int level = 0; level < wavelet_depth; level++) {
> +            /* Subband size at this level. */
> +            ivec2 band_dim = planes[p].dwt_dim >> (wavelet_depth - level);
> +            for (int o = int(level > 0); o < 4; o++) {
> +                /* Rectangle of the subband that belongs to this slice. */
> +                const int left = band_dim.x * slice_coord.x / num_slices.x;
> +                const int right = band_dim.x * (slice_coord.x+1) / num_slices.x;
> +                const int top = band_dim.y * slice_coord.y / num_slices.y;
> +                const int bottom = band_dim.y * (slice_coord.y+1) / num_slices.y;
> +
> +                const int q_idx = quants[level][o];
> +                const int qfactor = luts.ff_dirac_qscale_tab[q_idx];
> +
> +                /* Orientation bit 1 is the vertical selector, bit 0 the horizontal one. */
> +                const int yh = o >> 1;
> +                const int xh = o & 1;
> +
> +                const int stride = ffalign(planes[p].dwt_dim.x, 32);
> +                for (int y = top; y < bottom; y++) {
> +                    for (int x = left; x < right; x++) {
> +                        int sx = subband_coord(x, xh, level);
> +                        int sy = subband_coord(y, yh, level);
> +                        int coef = plane_dat[p].coef_buf[sy * stride + sx];
> +                        /* Magnitude code length plus one sign bit when non-zero. */
> +                        uint c_abs = uint(abs(coef));
> +                        c_abs = (c_abs << 2) / qfactor;
> +                        bits += count_vc2_ue_uint(c_abs);
> +                        bits += int(c_abs > 0);
> +                    }
> +                }
> +            }
> +        }
> +        /* Round up to a whole byte. */
> +        bits += ffalign(bits, 8) - bits;
> +        /* Plane payload in bytes (length byte excluded), then the padding
> +         * that brings it to a multiple of size_scaler. */
> +        int bytes_len = (bits >> 3) - bytes_start - 1;
> +        int pad_s = ffalign(bytes_len, size_scaler) / size_scaler;
> +        int pad_c = (pad_s * size_scaler) - bytes_len;
> +        bits += pad_c * 8;
> +    }
> +
> +    cache[quant_index] = bits;
> +    return bits;
> +}
> +
> +/* Rounds a byte count up with ffalign(b, size_scaler), then adds 4 and prefix_bytes. */
> +int ssize_round(int b) {
> +    const int rounded = ffalign(b, size_scaler);
> +    return rounded + 4 + prefix_bytes;
> +}
> +
> +/*
> + * Selects a quantiser index for each slice so that its coded size lies
> + * within [bits_floor, bits_ceil] where possible, stores the index and the
> + * rounded slice size in slice.args[], and stores in pb_start the summed
> + * sizes of the preceding slices of the same workgroup.
> + */
> +void main() {
> +    int slice_index = int(gl_GlobalInvocationID.x);
> +    int max_index = num_slices.x * num_slices.y;
> +    /* Invocations past the last slice must not return early: barrier()
> +     * below has to be executed by every invocation of the workgroup.
> +     * They skip the work and contribute a size of 0 instead. */
> +    const bool in_range = slice_index < max_index;
> +    int bytes = 0;
> +    if (in_range) {


Try to avoid branches. Just overallocate the amount of slices by a couple.


> +        for (int i = 0; i < DIRAC_MAX_QUANT_INDEX; i++) {
> +            cache[i] = 0;
> +        }
> +        const int q_ceil = DIRAC_MAX_QUANT_INDEX;
> +        const int top = bits_ceil;
> +        const int bottom = bits_floor;
> +        /* The two most recently tried indices, newest first. */
> +        int quant_buf[2] = int[2](-1, -1);
> +        int quant = slice.args[slice_index].quant_idx;
> +        int step = 1;
> +        int bits_last = 0;
> +        int bits = count_hq_slice(quant);
> +        /* Raise the index while the slice is too large, lower it while it
> +         * is too small. */
> +        while ((bits > top) || (bits < bottom)) {
> +            const int signed_step = bits > top ? +step : -step;
> +            quant = clamp(quant + signed_step, 0, q_ceil-1);
> +            bits = count_hq_slice(quant);
> +            /* Back at the index tried two iterations ago: the search is
> +             * oscillating (or pinned at a clamp limit). Settle on the
> +             * larger of the two indices. */
> +            if (quant_buf[1] == quant) {
> +                quant = max(quant_buf[0], quant);
> +                bits = quant == quant_buf[0] ? bits_last : bits;
> +                break;
> +            }
> +            /* step starts at 1, so this clamp keeps it at 1. */
> +            step = clamp(step / 2, 1, (q_ceil - 1) / 2);
> +            quant_buf[1] = quant_buf[0];
> +            quant_buf[0] = quant;
> +            bits_last = bits;
> +        }
> +        bytes = ssize_round(bits >> 3);
> +        slice.args[slice_index].quant_idx = clamp(quant, 0, q_ceil-1);
> +        slice.args[slice_index].bytes = bytes;
> +    }
> +    slice_sizes[gl_LocalInvocationIndex] = bytes;
> +    barrier();
> +    if (in_range) {
> +        /* Prefix sum for all slices in current workgroup */
> +        int total_bytes = 0;
> +        for (int i = 0; i < gl_LocalInvocationIndex; i++) {
> +            total_bytes += slice_sizes[i];
> +        }
> +        slice.args[slice_index].pb_start = total_bytes;
> +    }
> +}
> \ No newline at end of file


Missing newline at the end of the file.


Apart from these nits, it looks really good, and seems to work so far on 
my machine.


More information about the ffmpeg-devel mailing list