[FFmpeg-devel] [PATCH] cbs_h2645: Improve performance of writing slices

Mark Thompson sw at jkqxz.net
Mon Nov 12 02:19:37 EET 2018


On 11/11/18 22:43, Andreas Rheinhardt wrote:
> Instead of using a combination of bitreader and -writer for copying data,
> one can byte-align the (obsolete and removed) bitreader to improve performance.
> With the right alignment one can even use memcpy. The right alignment
> normally exists for CABAC and hence for H.265 in general.
> For aligned data this reduced the time to copy the slicedata from
> 776520 decicycles to 33889 with 262144 runs and a 6.5mb/s H.264 video.
> For unaligned data the number went down from 279196 to 97739 decicycles.
> ---
>  libavcodec/cbs_h2645.c | 119 ++++++++++++++++++++++++-----------------
>  1 file changed, 69 insertions(+), 50 deletions(-)
> 
> diff --git a/libavcodec/cbs_h2645.c b/libavcodec/cbs_h2645.c
> index e55bd00183..416d3fd32a 100644
> --- a/libavcodec/cbs_h2645.c
> +++ b/libavcodec/cbs_h2645.c
> @@ -1050,6 +1050,64 @@ static int cbs_h265_read_nal_unit(CodedBitstreamContext *ctx,
>      return 0;
>  }
>  
> +static int cbs_h2645_write_slice_data(CodedBitstreamContext *ctx,
> +                                      PutBitContext *pbc, const uint8_t *data,
> +                                      size_t data_size, int data_bit_start)
> +{
> +    size_t rest  = data_size - (data_bit_start + 7) / 8;
> +    const uint8_t *pos = data + data_bit_start / 8;
> +
> +    av_assert0(data_bit_start >= 0 &&
> +               8 * data_size > data_bit_start);
> +
> +    if (data_size * 8 + 8 > put_bits_left(pbc))
> +        return AVERROR(ENOSPC);
> +
> +    if (!rest)
> +        goto rbsp_stop_one_bit;
> +
> +    // First copy the remaining bits of the first byte
> +    // The above check ensures that we do not accidentally
> +    // copy beyond the rbsp_stop_one_bit.
> +    if (data_bit_start % 8)
> +        put_bits(pbc, 8 - data_bit_start % 8,
> +                *pos++ & MAX_UINT_BITS(8 - data_bit_start % 8));
> +
> +    if (put_bits_count(pbc) % 8 == 0) {
> +        // If the writer is aligned at this point,
> +        // memcpy can be used to improve performance.
> +        // This happens normally for CABAC.
> +        flush_put_bits(pbc);
> +        memcpy(put_bits_ptr(pbc), pos, rest);
> +        skip_put_bytes(pbc, rest);
> +    } else {
> +        // If not, we have to copy manually.
> +        // rbsp_stop_one_bit forces us to special-case
> +        // the last byte.
> +        uint8_t temp;
> +        int i;
> +
> +        for (; rest > 4; rest -= 4, pos += 4)
> +            put_bits32(pbc, AV_RB32(pos));
> +
> +        for (; rest > 1; rest--, pos++)
> +            put_bits(pbc, 8, *pos);
> +
> +    rbsp_stop_one_bit:
> +        temp = rest ? *pos : *pos & MAX_UINT_BITS(8 - data_bit_start % 8);
> +
> +        av_assert0(temp);
> +        i = ff_ctz(*pos);
> +        temp = temp >> i;
> +        i = rest ? (8 - i) : (8 - i - data_bit_start % 8);
> +        put_bits(pbc, i, temp);
> +        if (put_bits_count(pbc) % 8)
> +            put_bits(pbc, 8 - put_bits_count(pbc) % 8, 0U);
> +    }
> +
> +    return 0;
> +}
> +
>  static int cbs_h264_write_nal_unit(CodedBitstreamContext *ctx,
>                                     CodedBitstreamUnit *unit,
>                                     PutBitContext *pbc)
> @@ -1100,37 +1158,17 @@ static int cbs_h264_write_nal_unit(CodedBitstreamContext *ctx,
>      case H264_NAL_AUXILIARY_SLICE:
>          {
>              H264RawSlice *slice = unit->content;
> -            GetBitContext gbc;
> -            int bits_left, end, zeroes;
>  
>              err = cbs_h264_write_slice_header(ctx, pbc, &slice->header);
>              if (err < 0)
>                  return err;
>  
>              if (slice->data) {
> -                if (slice->data_size * 8 + 8 > put_bits_left(pbc))
> -                    return AVERROR(ENOSPC);
> -
> -                init_get_bits(&gbc, slice->data, slice->data_size * 8);
> -                skip_bits_long(&gbc, slice->data_bit_start);
> -
> -                // Copy in two-byte blocks, but stop before copying the
> -                // rbsp_stop_one_bit in the final byte.
> -                while (get_bits_left(&gbc) > 23)
> -                    put_bits(pbc, 16, get_bits(&gbc, 16));
> -
> -                bits_left = get_bits_left(&gbc);
> -                end = get_bits(&gbc, bits_left);
> -
> -                // rbsp_stop_one_bit must be present here.
> -                av_assert0(end);
> -                zeroes = ff_ctz(end);
> -                if (bits_left > zeroes + 1)
> -                    put_bits(pbc, bits_left - zeroes - 1,
> -                             end >> (zeroes + 1));
> -                put_bits(pbc, 1, 1);
> -                while (put_bits_count(pbc) % 8 != 0)
> -                    put_bits(pbc, 1, 0);
> +                err = cbs_h2645_write_slice_data(ctx, pbc, slice->data,
> +                                                 slice->data_size,
> +                                                 slice->data_bit_start);
> +                if (err < 0)
> +                    return err;
>              } else {
>                  // No slice data - that was just the header.
>                  // (Bitstream may be unaligned!)
> @@ -1254,39 +1292,20 @@ static int cbs_h265_write_nal_unit(CodedBitstreamContext *ctx,
>      case HEVC_NAL_CRA_NUT:
>          {
>              H265RawSlice *slice = unit->content;
> -            GetBitContext gbc;
> -            int bits_left, end, zeroes;
>  
>              err = cbs_h265_write_slice_segment_header(ctx, pbc, &slice->header);
>              if (err < 0)
>                  return err;
>  
>              if (slice->data) {
> -                if (slice->data_size * 8 + 8 > put_bits_left(pbc))
> -                    return AVERROR(ENOSPC);
> -
> -                init_get_bits(&gbc, slice->data, slice->data_size * 8);
> -                skip_bits_long(&gbc, slice->data_bit_start);
> -
> -                // Copy in two-byte blocks, but stop before copying the
> -                // rbsp_stop_one_bit in the final byte.
> -                while (get_bits_left(&gbc) > 23)
> -                    put_bits(pbc, 16, get_bits(&gbc, 16));
> -
> -                bits_left = get_bits_left(&gbc);
> -                end = get_bits(&gbc, bits_left);
> -
> -                // rbsp_stop_one_bit must be present here.
> -                av_assert0(end);
> -                zeroes = ff_ctz(end);
> -                if (bits_left > zeroes + 1)
> -                    put_bits(pbc, bits_left - zeroes - 1,
> -                             end >> (zeroes + 1));
> -                put_bits(pbc, 1, 1);
> -                while (put_bits_count(pbc) % 8 != 0)
> -                    put_bits(pbc, 1, 0);
> +                err = cbs_h2645_write_slice_data(ctx, pbc, slice->data,
> +                                                 slice->data_size,
> +                                                 slice->data_bit_start);
> +                if (err < 0)
> +                    return err;
>              } else {
>                  // No slice data - that was just the header.
> +                // (Bitstream may be unaligned!)

This comment change isn't accurate - the bitstream will always be aligned for H.265 in this case because byte alignment is included at the end of the slice segment header.  (I've just removed it.)

>              }
>          }
>          break;
> 

LGTM, tested, applied.

Thanks!

- Mark


On 11/11/18 22:32, Andreas Rheinhardt wrote:
> ...  Btw: What was the normal speedup you got when copying in the aligned mode?

Macro-level tests come out very well here.

Test files:
A  900 frames of single-slice 4K H.265, 1.5GB (900 writes of slice data, averaging 1.6MB each).
B  38074 frames of 32-slice 1080p H.264, 1.5GB (1.2M writes of slice data, averaging 1.2kB each).

In each case: load the input file into memory, run "./ffmpeg -i input-file -c:v copy -bsf:v hxxx_metadata -f null -" five times, and average the results.

Intel 8700 (Coffee Lake):
A    ~83fps ->  ~274fps
B  ~5150fps -> ~8160fps

Rockchip 3288 (Cortex A15):
A    ~31fps ->   ~48fps
B  ~1210fps -> ~1700fps

So, around 50% increase in hxxx_metadata throughput for these cases.  (And I guess Intel is very good at large memcpy for the first one.)


More information about the ffmpeg-devel mailing list