[FFmpeg-devel] [PATCH] cbs_h2645: Improve performance of writing slices
Mark Thompson
sw at jkqxz.net
Mon Nov 12 02:19:37 EET 2018
On 11/11/18 22:43, Andreas Rheinhardt wrote:
> Instead of using a combination of bitreader and -writer for copying data,
> one can byte-align the (obsolete and removed) bitreader to improve performance.
> With the right alignment one can even use memcpy. The right alignment
> normally exists for CABAC and hence for H.265 in general.
> For aligned data this reduced the time to copy the slicedata from
> 776520 decicycles to 33889 with 262144 runs and a 6.5mb/s H.264 video.
> For unaligned data the number went down from 279196 to 97739 decicycles.
> ---
> libavcodec/cbs_h2645.c | 119 ++++++++++++++++++++++++-----------------
> 1 file changed, 69 insertions(+), 50 deletions(-)
>
> diff --git a/libavcodec/cbs_h2645.c b/libavcodec/cbs_h2645.c
> index e55bd00183..416d3fd32a 100644
> --- a/libavcodec/cbs_h2645.c
> +++ b/libavcodec/cbs_h2645.c
> @@ -1050,6 +1050,64 @@ static int cbs_h265_read_nal_unit(CodedBitstreamContext *ctx,
> return 0;
> }
>
> +static int cbs_h2645_write_slice_data(CodedBitstreamContext *ctx,
> + PutBitContext *pbc, const uint8_t *data,
> + size_t data_size, int data_bit_start)
> +{
> + size_t rest = data_size - (data_bit_start + 7) / 8;
> + const uint8_t *pos = data + data_bit_start / 8;
> +
> + av_assert0(data_bit_start >= 0 &&
> + 8 * data_size > data_bit_start);
> +
> + if (data_size * 8 + 8 > put_bits_left(pbc))
> + return AVERROR(ENOSPC);
> +
> + if (!rest)
> + goto rbsp_stop_one_bit;
> +
> + // First copy the remaining bits of the first byte
> + // The above check ensures that we do not accidentally
> + // copy beyond the rbsp_stop_one_bit.
> + if (data_bit_start % 8)
> + put_bits(pbc, 8 - data_bit_start % 8,
> + *pos++ & MAX_UINT_BITS(8 - data_bit_start % 8));
> +
> + if (put_bits_count(pbc) % 8 == 0) {
> + // If the writer is aligned at this point,
> + // memcpy can be used to improve performance.
> + // This happens normally for CABAC.
> + flush_put_bits(pbc);
> + memcpy(put_bits_ptr(pbc), pos, rest);
> + skip_put_bytes(pbc, rest);
> + } else {
> + // If not, we have to copy manually.
> + // rbsp_stop_one_bit forces us to special-case
> + // the last byte.
> + uint8_t temp;
> + int i;
> +
> + for (; rest > 4; rest -= 4, pos += 4)
> + put_bits32(pbc, AV_RB32(pos));
> +
> + for (; rest > 1; rest--, pos++)
> + put_bits(pbc, 8, *pos);
> +
> + rbsp_stop_one_bit:
> + temp = rest ? *pos : *pos & MAX_UINT_BITS(8 - data_bit_start % 8);
> +
> + av_assert0(temp);
> + i = ff_ctz(*pos);
> + temp = temp >> i;
> + i = rest ? (8 - i) : (8 - i - data_bit_start % 8);
> + put_bits(pbc, i, temp);
> + if (put_bits_count(pbc) % 8)
> + put_bits(pbc, 8 - put_bits_count(pbc) % 8, 0U);
> + }
> +
> + return 0;
> +}
> +
> static int cbs_h264_write_nal_unit(CodedBitstreamContext *ctx,
> CodedBitstreamUnit *unit,
> PutBitContext *pbc)
> @@ -1100,37 +1158,17 @@ static int cbs_h264_write_nal_unit(CodedBitstreamContext *ctx,
> case H264_NAL_AUXILIARY_SLICE:
> {
> H264RawSlice *slice = unit->content;
> - GetBitContext gbc;
> - int bits_left, end, zeroes;
>
> err = cbs_h264_write_slice_header(ctx, pbc, &slice->header);
> if (err < 0)
> return err;
>
> if (slice->data) {
> - if (slice->data_size * 8 + 8 > put_bits_left(pbc))
> - return AVERROR(ENOSPC);
> -
> - init_get_bits(&gbc, slice->data, slice->data_size * 8);
> - skip_bits_long(&gbc, slice->data_bit_start);
> -
> - // Copy in two-byte blocks, but stop before copying the
> - // rbsp_stop_one_bit in the final byte.
> - while (get_bits_left(&gbc) > 23)
> - put_bits(pbc, 16, get_bits(&gbc, 16));
> -
> - bits_left = get_bits_left(&gbc);
> - end = get_bits(&gbc, bits_left);
> -
> - // rbsp_stop_one_bit must be present here.
> - av_assert0(end);
> - zeroes = ff_ctz(end);
> - if (bits_left > zeroes + 1)
> - put_bits(pbc, bits_left - zeroes - 1,
> - end >> (zeroes + 1));
> - put_bits(pbc, 1, 1);
> - while (put_bits_count(pbc) % 8 != 0)
> - put_bits(pbc, 1, 0);
> + err = cbs_h2645_write_slice_data(ctx, pbc, slice->data,
> + slice->data_size,
> + slice->data_bit_start);
> + if (err < 0)
> + return err;
> } else {
> // No slice data - that was just the header.
> // (Bitstream may be unaligned!)
> @@ -1254,39 +1292,20 @@ static int cbs_h265_write_nal_unit(CodedBitstreamContext *ctx,
> case HEVC_NAL_CRA_NUT:
> {
> H265RawSlice *slice = unit->content;
> - GetBitContext gbc;
> - int bits_left, end, zeroes;
>
> err = cbs_h265_write_slice_segment_header(ctx, pbc, &slice->header);
> if (err < 0)
> return err;
>
> if (slice->data) {
> - if (slice->data_size * 8 + 8 > put_bits_left(pbc))
> - return AVERROR(ENOSPC);
> -
> - init_get_bits(&gbc, slice->data, slice->data_size * 8);
> - skip_bits_long(&gbc, slice->data_bit_start);
> -
> - // Copy in two-byte blocks, but stop before copying the
> - // rbsp_stop_one_bit in the final byte.
> - while (get_bits_left(&gbc) > 23)
> - put_bits(pbc, 16, get_bits(&gbc, 16));
> -
> - bits_left = get_bits_left(&gbc);
> - end = get_bits(&gbc, bits_left);
> -
> - // rbsp_stop_one_bit must be present here.
> - av_assert0(end);
> - zeroes = ff_ctz(end);
> - if (bits_left > zeroes + 1)
> - put_bits(pbc, bits_left - zeroes - 1,
> - end >> (zeroes + 1));
> - put_bits(pbc, 1, 1);
> - while (put_bits_count(pbc) % 8 != 0)
> - put_bits(pbc, 1, 0);
> + err = cbs_h2645_write_slice_data(ctx, pbc, slice->data,
> + slice->data_size,
> + slice->data_bit_start);
> + if (err < 0)
> + return err;
> } else {
> // No slice data - that was just the header.
> + // (Bitstream may be unaligned!)
This comment change isn't accurate - the bitstream will always be aligned for H.265 in this case because byte alignment is included at the end of the slice segment header. (I've just removed it.)
> }
> }
> break;
>
LGTM, tested, applied.
Thanks!
- Mark
On 11/11/18 22:32, Andreas Rheinhardt wrote:
> ... Btw: What was the normal speedup you got when copying in the aligned mode?
Macro-level tests come out very well here.
Test files:
A: 900 frames of single-slice 4K H.265, 1.5GB (900 writes of slice data, averaging 1.6MB each).
B: 38074 frames of 32-slice 1080p H.264, 1.5GB (1.2 million writes of slice data, averaging 1.2kB each).
In each case: get the input file into memory, run "./ffmpeg -i input-file -c:v copy -bsf:v hxxx_metadata -f null -" five times, and average the results.
Intel 8700 (Coffee Lake):
A ~83fps -> ~274fps
B ~5150fps -> ~8160fps
Rockchip 3288 (Cortex A15):
A ~31fps -> ~48fps
B ~1210fps -> ~1700fps
So, around 50% increase in hxxx_metadata throughput for these cases. (And I guess Intel is very good at large memcpy for the first one.)
More information about the ffmpeg-devel mailing list