[FFmpeg-devel] [PATCH v2] libswscale/ppc: VSX-optimize 9-16 bit yuv2planeX

Michael Niedermayer michael at niedermayer.cc
Mon Jan 7 21:08:52 EET 2019


On Mon, Jan 07, 2019 at 10:01:22AM +0200, Lauri Kasanen wrote:
> ./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt yuv420p16be \
> -s 1920x1728 -f null -vframes 100 -v error -nostats -
> 
> 9-14 bit funcs get about 6x speedup, 16-bit gets about 15x.
> Fate passes, each format tested with an image to video conversion.
> 
> Only POWER8 includes 32-bit vector multiplies, so POWER7 is locked out
> of the 16-bit function. This includes the vec_mulo/mule functions too,
> not just vmuluwm.
> 
> yuv420p9le
>   12341 UNITS in planarX,  130976 runs,     96 skips
>   73752 UNITS in planarX,  131066 runs,      6 skips
> yuv420p9be
>   12364 UNITS in planarX,  131025 runs,     47 skips
>   73001 UNITS in planarX,  131055 runs,     17 skips
> yuv420p10le
>   12386 UNITS in planarX,  131042 runs,     30 skips
>   72735 UNITS in planarX,  131062 runs,     10 skips
> yuv420p10be
>   12337 UNITS in planarX,  131045 runs,     27 skips
>   72734 UNITS in planarX,  131057 runs,     15 skips
> yuv420p12le
>   12236 UNITS in planarX,  131058 runs,     14 skips
>   73029 UNITS in planarX,  131062 runs,     10 skips
> yuv420p12be
>   12218 UNITS in planarX,  130973 runs,     99 skips
>   72402 UNITS in planarX,  131069 runs,      3 skips
> yuv420p14le
>   12168 UNITS in planarX,  131067 runs,      5 skips
>   72480 UNITS in planarX,  131069 runs,      3 skips
> yuv420p14be
>   12358 UNITS in planarX,  130948 runs,    124 skips
>   73772 UNITS in planarX,  131063 runs,      9 skips
> yuv420p16le
>   10439 UNITS in planarX,  130911 runs,    161 skips
>  157923 UNITS in planarX,  131068 runs,      4 skips
> yuv420p16be
>   10463 UNITS in planarX,  130874 runs,    198 skips
>  154405 UNITS in planarX,  131061 runs,     11 skips
> 
> Signed-off-by: Lauri Kasanen <cand at gmx.com>
> ---
> 
> v2: Separate macros so that yuv2plane1_16_vsx remains available for power7
> 
>  libswscale/ppc/swscale_ppc_template.c |   4 +-
>  libswscale/ppc/swscale_vsx.c          | 190 +++++++++++++++++++++++++++++++++-
>  2 files changed, 189 insertions(+), 5 deletions(-)
> 
> diff --git a/libswscale/ppc/swscale_ppc_template.c b/libswscale/ppc/swscale_ppc_template.c
> index 00e4b99..11decab 100644
> --- a/libswscale/ppc/swscale_ppc_template.c
> +++ b/libswscale/ppc/swscale_ppc_template.c
> @@ -21,7 +21,7 @@
>   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>   */
>  
> -static void FUNC(yuv2planeX_16)(const int16_t *filter, int filterSize,
> +static void FUNC(yuv2planeX_8_16)(const int16_t *filter, int filterSize,
>                                    const int16_t **src, uint8_t *dest,
>                                    const uint8_t *dither, int offset, int x)
>  {
> @@ -88,7 +88,7 @@ static void FUNC(yuv2planeX)(const int16_t *filter, int filterSize,
>      yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0);
>  
>      for (i = dst_u; i < dstW - 15; i += 16)
> -        FUNC(yuv2planeX_16)(filter, filterSize, src, dest + i, dither,
> +        FUNC(yuv2planeX_8_16)(filter, filterSize, src, dest + i, dither,
>                                offset, i);
>  
>      yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i);
> diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
> index 70da6ae..1fd392e 100644
> --- a/libswscale/ppc/swscale_vsx.c
> +++ b/libswscale/ppc/swscale_vsx.c
> @@ -83,6 +83,8 @@
>  #include "swscale_ppc_template.c"
>  #undef FUNC
>  
> +#undef vzero
> +
>  #endif /* !HAVE_BIGENDIAN */
>  
>  static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW,
> @@ -180,6 +182,76 @@ static void yuv2plane1_nbps_vsx(const int16_t *src, uint16_t *dest, int dstW,
>      yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i);
>  }
>  
> +static void yuv2planeX_nbps_u(const int16_t *filter, int filterSize,
> +                              const int16_t **src, uint16_t *dest, int dstW,
> +                              int big_endian, int output_bits, int start)
> +{
> +    int i;
> +    int shift = 11 + 16 - output_bits;
> +
> +    for (i = start; i < dstW; i++) {
> +        int val = 1 << (shift - 1);
> +        int j;
> +
> +        for (j = 0; j < filterSize; j++)
> +            val += src[j][i] * filter[j];
> +
> +        output_pixel(&dest[i], val);
> +    }
> +}
> +
> +static void yuv2planeX_nbps_vsx(const int16_t *filter, int filterSize,
> +                                const int16_t **src, uint16_t *dest, int dstW,
> +                                int big_endian, int output_bits)
> +{
> +    const int dst_u = -(uintptr_t)dest & 7;
> +    const int shift = 11 + 16 - output_bits;
> +    const int add = (1 << (shift - 1));
> +    const int clip = (1 << output_bits) - 1;
> +    const uint16_t swap = big_endian ? 8 : 0;
> +    const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
> +    const vector uint32_t vshift = (vector uint32_t) {shift, shift, shift, shift};
> +    const vector uint16_t vswap = (vector uint16_t) {swap, swap, swap, swap, swap, swap, swap, swap};
> +    const vector uint16_t vlargest = (vector uint16_t) {clip, clip, clip, clip, clip, clip, clip, clip};
> +    const vector int16_t vzero = vec_splat_s16(0);
> +    const vector uint8_t vperm = (vector uint8_t) {0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
> +    vector int16_t vfilter[MAX_FILTER_SIZE], vin;
> +    vector uint16_t v;
> +    vector uint32_t vleft, vright, vtmp;
> +    int i, j;
> +
> +    for (i = 0; i < filterSize; i++) {
> +        vfilter[i] = (vector int16_t) {filter[i], filter[i], filter[i], filter[i],
> +                                       filter[i], filter[i], filter[i], filter[i]};
> +    }
> +
> +    yuv2planeX_nbps_u(filter, filterSize, src, dest, dst_u, big_endian, output_bits, 0);
> +
> +    for (i = dst_u; i < dstW - 7; i += 8) {
> +        vleft = vright = vadd;
> +
> +        for (j = 0; j < filterSize; j++) {
> +            vin = vec_vsx_ld(0, &src[j][i]);
> +            vtmp = (vector uint32_t) vec_mule(vin, vfilter[j]);
> +            vleft = vec_add(vleft, vtmp);
> +            vtmp = (vector uint32_t) vec_mulo(vin, vfilter[j]);
> +            vright = vec_add(vright, vtmp);
> +        }
> +
> +        vleft = vec_sra(vleft, vshift);
> +        vright = vec_sra(vright, vshift);
> +        v = vec_packsu(vleft, vright);
> +        v = (vector uint16_t) vec_max((vector int16_t) v, vzero);
> +        v = vec_min(v, vlargest);
> +        v = vec_rl(v, vswap);
> +        v = vec_perm(v, v, vperm);
> +        vec_st(v, 0, &dest[i]);
> +    }
> +
> +    yuv2planeX_nbps_u(filter, filterSize, src, dest, dstW, big_endian, output_bits, i);
> +}
> +
> +
>  #undef output_pixel
>  
>  #define output_pixel(pos, val, bias, signedness) \
> @@ -234,7 +306,97 @@ static void yuv2plane1_16_vsx(const int32_t *src, uint16_t *dest, int dstW,
>      yuv2plane1_16_u(src, dest, dstW, big_endian, output_bits, i);
>  }
>  

> +#ifdef __POWER8_VECTOR__

We generally test for build/CPU environment capabilities in configure
and then set macros like HAVE_MMX / HAVE_MMX_EXTERNAL / ...,
and where needed test again at runtime whether the actual CPU the code runs
on supports the extension.

I don't know whether it can happen that code is built where this is supported
and then run where it is not,

but this at least looks like it should be tested for by configure.
This also allows the user to force these code paths not to be built.

[...]

> +#ifdef __GNUC__
> +            // GCC does not support vmuluwm yet. Bug open.
> +            __asm__("vmuluwm	%0, %1, %2" : "=v"(vtmp) : "v"(vin32l), "v"(vfilter[j]));
> +            vleft = vec_add(vleft, vtmp);
> +            __asm__("vmuluwm	%0, %1, %2" : "=v"(vtmp) : "v"(vin32r), "v"(vfilter[j]));

Tabs are forbidden in .c files in the FFmpeg git repository.

[...]

-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Frequently ignored answer#1 FFmpeg bugs should be sent to our bugtracker. User
questions about the command line tools should be sent to the ffmpeg-user ML.
And questions about how to use libav* should be sent to the libav-user ML.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 181 bytes
Desc: not available
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20190107/eec1e6cc/attachment.sig>


More information about the ffmpeg-devel mailing list