[FFmpeg-devel] [PATCH] avfilter/vf_transpose: rewrite for x86 SIMD

Paul B Mahol onemda at gmail.com
Wed Aug 16 10:00:36 EEST 2017


On 8/16/17, James Almer <jamrial at gmail.com> wrote:
> On 8/15/2017 5:25 PM, Paul B Mahol wrote:
>> From f25f0022fbc675affd65b95f097fa62e55788a37 Mon Sep 17 00:00:00 2001
>> From: Paul B Mahol <onemda at gmail.com>
>> Date: Tue, 15 Aug 2017 20:12:32 +0200
>> Subject: [PATCH] avfilter/vf_transpose: rewrite for x86 SIMD
>>
>> Transpose first in chunks of 8x8 blocks.
>> 15% faster overall.
>> ---
>>  libavfilter/vf_transpose.c | 184
>> +++++++++++++++++++++++++++++++++++----------
>>  1 file changed, 143 insertions(+), 41 deletions(-)
>>
>> diff --git a/libavfilter/vf_transpose.c b/libavfilter/vf_transpose.c
>> index 75b4dda41f..2fa751c925 100644
>> --- a/libavfilter/vf_transpose.c
>> +++ b/libavfilter/vf_transpose.c
>> @@ -58,6 +58,12 @@ typedef struct TransContext {
>>
>>      int passthrough;    ///< PassthroughType, landscape passthrough mode
>> enabled
>>      int dir;            ///< TransposeDir
>> +
>> +    void (*transpose_8x8)(uint8_t *src, ptrdiff_t src_linesize,
>> +                          uint8_t *dst, ptrdiff_t dst_linesize);
>> +    void (*transpose_block)(uint8_t *src, ptrdiff_t src_linesize,
>> +                            uint8_t *dst, ptrdiff_t dst_linesize,
>> +                            int w, int h);
>>  } TransContext;
>>
>>  static int query_formats(AVFilterContext *ctx)
>> @@ -79,6 +85,109 @@ static int query_formats(AVFilterContext *ctx)
>>      return ff_set_common_formats(ctx, pix_fmts);
>>  }
>>
>> +static av_always_inline void transpose_block_8_c(uint8_t *src, ptrdiff_t
>> src_linesize,
>
> Is always_inline needed? Shouldn't inline be enough for the 8x8 functions?

Changed.

>
>> +                                                 uint8_t *dst, ptrdiff_t
>> dst_linesize,
>> +                                                 int w, int h)
>> +{
>> +    int x, y;
>> +    for (y = 0; y < h; y++, dst += dst_linesize, src++)
>> +        for (x = 0; x < w; x++)
>> +            dst[x] = src[x*src_linesize];
>> +}
>> +
>> +static void transpose_8x8_8_c(uint8_t *src, ptrdiff_t src_linesize,
>> +                              uint8_t *dst, ptrdiff_t dst_linesize)
>> +{
>> +    transpose_block_8_c(src, src_linesize, dst, dst_linesize, 8, 8);
>> +}
>> +
>> +static av_always_inline void transpose_block_16_c(uint8_t *src, ptrdiff_t
>> src_linesize,
>> +                                                  uint8_t *dst, ptrdiff_t
>> dst_linesize,
>> +                                                  int w, int h)
>> +{
>> +    int x, y;
>> +    for (y = 0; y < h; y++, dst += dst_linesize, src += 2)
>> +        for (x = 0; x < w; x++)
>> +            *((uint16_t *)(dst + 2*x)) = *((uint16_t *)(src +
>> x*src_linesize));
>
> Use local uint16_t* pointers instead of casting inside the loop. It will
> probably make no difference to compilers, but it helps readability.
>
> Same for the cases below.
>

For the src case it's not trivial; feel free to do it in another commit.

>> +}
>> +
>> +static void transpose_8x8_16_c(uint8_t *src, ptrdiff_t src_linesize,
>> +                               uint8_t *dst, ptrdiff_t dst_linesize)
>> +{
>> +    transpose_block_16_c(src, src_linesize, dst, dst_linesize, 8, 8);
>> +}
>> +
>> +static av_always_inline void transpose_block_24_c(uint8_t *src, ptrdiff_t
>> src_linesize,
>> +                                                  uint8_t *dst, ptrdiff_t
>> dst_linesize,
>> +                                                  int w, int h)
>> +{
>> +    int x, y;
>> +    for (y = 0; y < h; y++, dst += dst_linesize) {
>> +        for (x = 0; x < w; x++) {
>> +            int32_t v = AV_RB24(src + x*src_linesize + y*3);
>> +            AV_WB24(dst + 3*x, v);
>> +        }
>> +    }
>> +}
>> +
>> +static void transpose_8x8_24_c(uint8_t *src, ptrdiff_t src_linesize,
>> +                               uint8_t *dst, ptrdiff_t dst_linesize)
>> +{
>> +    transpose_block_24_c(src, src_linesize, dst, dst_linesize, 8, 8);
>> +}
>> +
>> +static av_always_inline void transpose_block_32_c(uint8_t *src, ptrdiff_t
>> src_linesize,
>> +                                                  uint8_t *dst, ptrdiff_t
>> dst_linesize,
>> +                                                  int w, int h)
>> +{
>> +    int x, y;
>> +    for (y = 0; y < h; y++, dst += dst_linesize, src += 4) {
>> +        for (x = 0; x < w; x++)
>> +            *((uint32_t *)(dst + 4*x)) = *((uint32_t *)(src +
>> x*src_linesize));
>> +    }
>> +}
>> +
>> +static void transpose_8x8_32_c(uint8_t *src, ptrdiff_t src_linesize,
>> +                               uint8_t *dst, ptrdiff_t dst_linesize)
>> +{
>> +    transpose_block_32_c(src, src_linesize, dst, dst_linesize, 8, 8);
>> +}
>> +
>> +static av_always_inline void transpose_block_48_c(uint8_t *src, ptrdiff_t
>> src_linesize,
>> +                                                  uint8_t *dst, ptrdiff_t
>> dst_linesize,
>> +                                                  int w, int h)
>> +{
>> +    int x, y;
>> +    for (y = 0; y < h; y++, dst += dst_linesize, src += 6) {
>> +        for (x = 0; x < w; x++) {
>> +            int64_t v = AV_RB48(src + x*src_linesize);
>> +            AV_WB48(dst + 6*x, v);
>> +        }
>> +    }
>> +}
>> +
>> +static void transpose_8x8_48_c(uint8_t *src, ptrdiff_t src_linesize,
>> +                               uint8_t *dst, ptrdiff_t dst_linesize)
>> +{
>> +    transpose_block_48_c(src, src_linesize, dst, dst_linesize, 8, 8);
>> +}
>> +
>> +static av_always_inline void transpose_block_64_c(uint8_t *src, ptrdiff_t
>> src_linesize,
>> +                                                  uint8_t *dst, ptrdiff_t
>> dst_linesize,
>> +                                                  int w, int h)
>> +{
>> +    int x, y;
>> +    for (y = 0; y < h; y++, dst += dst_linesize, src += 8)
>> +        for (x = 0; x < w; x++)
>> +            *((uint64_t *)(dst + 8*x)) = *((uint64_t *)(src +
>> x*src_linesize));
>> +}
>> +
>> +static void transpose_8x8_64_c(uint8_t *src, ptrdiff_t src_linesize,
>> +                               uint8_t *dst, ptrdiff_t dst_linesize)
>> +{
>> +    transpose_block_64_c(src, src_linesize, dst, dst_linesize, 8, 8);
>> +}
>> +
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>


More information about the ffmpeg-devel mailing list