[FFmpeg-devel] libavcodec/exr : add X86 64 SIMD for reorder pixels (SSE and AVX2) (v4)

James Almer jamrial at gmail.com
Sun Sep 17 22:19:44 EEST 2017


On 9/17/2017 3:22 PM, Martin Vignali wrote:
> From 338f96a7f3c0f97cfafc0deda2322695a4006b5a Mon Sep 17 00:00:00 2001
> From: Martin Vignali <martin.vignali at gmail.com>
> Date: Sun, 17 Sep 2017 20:05:16 +0200
> Subject: [PATCH] libavcodec/exr : add X86 64 SIMD for reorder_pixels
> 
> ---
>  libavcodec/Makefile          |  2 +-
>  libavcodec/exr.c             | 44 ++++++++++++++--------------
>  libavcodec/exrdsp.c          | 45 +++++++++++++++++++++++++++++
>  libavcodec/exrdsp.h          | 32 ++++++++++++++++++++
>  libavcodec/x86/Makefile      |  2 ++
>  libavcodec/x86/exrdsp.asm    | 69 ++++++++++++++++++++++++++++++++++++++++++++
>  libavcodec/x86/exrdsp_init.c | 43 +++++++++++++++++++++++++++
>  7 files changed, 213 insertions(+), 24 deletions(-)
>  create mode 100644 libavcodec/exrdsp.c
>  create mode 100644 libavcodec/exrdsp.h
>  create mode 100644 libavcodec/x86/exrdsp.asm
>  create mode 100644 libavcodec/x86/exrdsp_init.c
> 
> diff --git a/libavcodec/Makefile b/libavcodec/Makefile
> index 943e5db511..fad56129a3 100644
> --- a/libavcodec/Makefile
> +++ b/libavcodec/Makefile
> @@ -286,7 +286,7 @@ OBJS-$(CONFIG_EIGHTSVX_FIB_DECODER)    += 8svx.o
>  OBJS-$(CONFIG_ESCAPE124_DECODER)       += escape124.o
>  OBJS-$(CONFIG_ESCAPE130_DECODER)       += escape130.o
>  OBJS-$(CONFIG_EVRC_DECODER)            += evrcdec.o acelp_vectors.o lsp.o
> -OBJS-$(CONFIG_EXR_DECODER)             += exr.o
> +OBJS-$(CONFIG_EXR_DECODER)             += exr.o exrdsp.o
>  OBJS-$(CONFIG_FFV1_DECODER)            += ffv1dec.o ffv1.o
>  OBJS-$(CONFIG_FFV1_ENCODER)            += ffv1enc.o ffv1.o
>  OBJS-$(CONFIG_FFWAVESYNTH_DECODER)     += ffwavesynth.o
> diff --git a/libavcodec/exr.c b/libavcodec/exr.c
> index 759880756d..478c127ebe 100644
> --- a/libavcodec/exr.c
> +++ b/libavcodec/exr.c
> @@ -40,6 +40,7 @@
>  #include "libavutil/avassert.h"
>  #include "libavutil/common.h"
>  #include "libavutil/imgutils.h"
> +#include "libavutil/timer.h"

Not needed.

>  #include "libavutil/intfloat.h"
>  #include "libavutil/opt.h"
>  #include "libavutil/color_utils.h"
> @@ -55,6 +56,7 @@
>  #include "internal.h"
>  #include "mathops.h"
>  #include "thread.h"
> +#include "exrdsp.h"

Add this one above get_bits.h, to keep the includes in alphabetical order.

>  
>  enum ExrCompr {
>      EXR_RAW,
> @@ -121,6 +123,7 @@ typedef struct EXRContext {
>      AVClass *class;
>      AVFrame *picture;
>      AVCodecContext *avctx;
> +    ExrDSPContext dsp;
>  
>  #if HAVE_BIGENDIAN
>      BswapDSPContext bbdsp;
> @@ -275,23 +278,7 @@ static void predictor(uint8_t *src, int size)
>      }
>  }
>  
> -static void reorder_pixels(uint8_t *src, uint8_t *dst, int size)
> -{
> -    const uint8_t *t1 = src;
> -    int half_size     = size / 2;
> -    const uint8_t *t2 = src + half_size;
> -    uint8_t *s        = dst;
> -    int i;
> -
> -    av_assert1(size % 2 == 0);
> -
> -    for (i = 0; i < half_size; i++) {
> -        *(s++) = *(t1++);
> -        *(s++) = *(t2++);
> -    }
> -}
> -
> -static int zip_uncompress(const uint8_t *src, int compressed_size,
> +static int zip_uncompress(EXRContext *s, const uint8_t *src, int compressed_size,
>                            int uncompressed_size, EXRThreadData *td)
>  {
>      unsigned long dest_len = uncompressed_size;
> @@ -300,13 +287,18 @@ static int zip_uncompress(const uint8_t *src, int compressed_size,
>          dest_len != uncompressed_size)
>          return AVERROR_INVALIDDATA;
>  
> +    av_assert1(uncompressed_size % 2 == 0);
> +
>      predictor(td->tmp, uncompressed_size);
> -    reorder_pixels(td->tmp, td->uncompressed_data, uncompressed_size);
> +
> +    //START_TIMER;

Don't add dead benchmarking/debug code.

> +    s->dsp.reorder_pixels(td->tmp, td->uncompressed_data, uncompressed_size);
> +    //STOP_TIMER("reorder_pixels_zip");
>  
>      return 0;
>  }
>  
> -static int rle_uncompress(const uint8_t *src, int compressed_size,
> +static int rle_uncompress(EXRContext *ctx, const uint8_t *src, int compressed_size,
>                            int uncompressed_size, EXRThreadData *td)
>  {
>      uint8_t *d      = td->tmp;
> @@ -345,8 +337,10 @@ static int rle_uncompress(const uint8_t *src, int compressed_size,
>      if (dend != d)
>          return AVERROR_INVALIDDATA;
>  
> +    av_assert1(uncompressed_size % 2 == 0);
> +
>      predictor(td->tmp, uncompressed_size);
> -    reorder_pixels(td->tmp, td->uncompressed_data, uncompressed_size);
> +    ctx->dsp.reorder_pixels(td->tmp, td->uncompressed_data, uncompressed_size);
>  
>      return 0;
>  }
> @@ -954,6 +948,7 @@ static void unpack_14(const uint8_t b[14], uint16_t s[16])
>      }
>  }
>  
> +

Stray newline.

>  static void unpack_3(const uint8_t b[3], uint16_t s[16])
>  {
>      int i;
> @@ -1000,6 +995,7 @@ static int b44_uncompress(EXRContext *s, const uint8_t *src, int compressed_size
>  
>                      if (src[compressed_size - stay_to_uncompress + 2] == 0xfc) { /* B44A block */
>                          unpack_3(sr, tmp_buffer);
> +

Same.

>                          sr += 3;
>                          stay_to_uncompress -= 3;
>                      }  else {/* B44 Block */
> @@ -1152,7 +1148,7 @@ static int decode_block(AVCodecContext *avctx, void *tdata,
>  
>      if (data_size < uncompressed_size) {
>          av_fast_padded_malloc(&td->uncompressed_data,
> -                              &td->uncompressed_size, uncompressed_size);
> +                              &td->uncompressed_size, uncompressed_size + 64);/* Force 64 padding for AVX2 reorder_pixels dst */
>  
>          if (!td->uncompressed_data)
>              return AVERROR(ENOMEM);
> @@ -1161,7 +1157,7 @@ static int decode_block(AVCodecContext *avctx, void *tdata,
>          switch (s->compression) {
>          case EXR_ZIP1:
>          case EXR_ZIP16:
> -            ret = zip_uncompress(src, data_size, uncompressed_size, td);
> +            ret = zip_uncompress(s, src, data_size, uncompressed_size, td);
>              break;
>          case EXR_PIZ:
>              ret = piz_uncompress(s, src, data_size, uncompressed_size, td);
> @@ -1170,7 +1166,7 @@ static int decode_block(AVCodecContext *avctx, void *tdata,
>              ret = pxr24_uncompress(s, src, data_size, uncompressed_size, td);
>              break;
>          case EXR_RLE:
> -            ret = rle_uncompress(src, data_size, uncompressed_size, td);
> +            ret = rle_uncompress(s, src, data_size, uncompressed_size, td);
>              break;
>          case EXR_B44:
>          case EXR_B44A:
> @@ -1804,6 +1800,8 @@ static av_cold int decode_init(AVCodecContext *avctx)
>  
>      s->avctx              = avctx;
>  
> +    ff_exrdsp_init(&s->dsp);
> +
>  #if HAVE_BIGENDIAN
>      ff_bswapdsp_init(&s->bbdsp);
>  #endif
> diff --git a/libavcodec/exrdsp.c b/libavcodec/exrdsp.c
> new file mode 100644
> index 0000000000..af47a6f8df
> --- /dev/null
> +++ b/libavcodec/exrdsp.c
> @@ -0,0 +1,45 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include <stdint.h>
> +
> +#include "libavutil/attributes.h"
> +#include "exrdsp.h"
> +#include "config.h"
> +
> +static void reorder_pixels_scalar(uint8_t *src, uint8_t *dst, ptrdiff_t size)
> +{
> +    const uint8_t *t1 = src;
> +    int half_size     = size / 2;
> +    const uint8_t *t2 = src + half_size;
> +    uint8_t *s        = dst;
> +    int i;
> +
> +    for (i = 0; i < half_size; i++) {
> +        *(s++) = *(t1++);
> +        *(s++) = *(t2++);
> +    }
> +}
> +
> +av_cold void ff_exrdsp_init(ExrDSPContext *c)
> +{
> +    c->reorder_pixels   = reorder_pixels_scalar;
> +
> +    if (ARCH_X86)
> +        ff_exrdsp_init_x86(c);
> +}
> diff --git a/libavcodec/exrdsp.h b/libavcodec/exrdsp.h
> new file mode 100644
> index 0000000000..09a76a518e
> --- /dev/null
> +++ b/libavcodec/exrdsp.h
> @@ -0,0 +1,32 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#ifndef AVCODEC_EXRDSP_H
> +#define AVCODEC_EXRDSP_H
> +
> +#include <stdint.h>
> +#include "libavutil/common.h"
> +
> +typedef struct ExrDSPContext {
> +    void (*reorder_pixels)(uint8_t *src, uint8_t *dst, ptrdiff_t size);
> +} ExrDSPContext;
> +
> +void ff_exrdsp_init(ExrDSPContext *c);
> +void ff_exrdsp_init_x86(ExrDSPContext *c);
> +
> +#endif /* AVCODEC_EXRDSP_H */
> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
> index e36644c72a..a805cd37b4 100644
> --- a/libavcodec/x86/Makefile
> +++ b/libavcodec/x86/Makefile
> @@ -52,6 +52,7 @@ OBJS-$(CONFIG_APNG_DECODER)            += x86/pngdsp_init.o
>  OBJS-$(CONFIG_CAVS_DECODER)            += x86/cavsdsp.o
>  OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o x86/synth_filter_init.o
>  OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc_init.o
> +OBJS-$(CONFIG_EXR_DECODER)             += x86/exrdsp_init.o
>  OBJS-$(CONFIG_OPUS_DECODER)            += x86/opus_dsp_init.o
>  OBJS-$(CONFIG_OPUS_ENCODER)            += x86/opus_dsp_init.o
>  OBJS-$(CONFIG_HEVC_DECODER)            += x86/hevcdsp_init.o
> @@ -153,6 +154,7 @@ X86ASM-OBJS-$(CONFIG_DCA_DECODER)      += x86/dcadsp.o x86/synth_filter.o
>  X86ASM-OBJS-$(CONFIG_DIRAC_DECODER)    += x86/diracdsp.o                \
>                                            x86/dirac_dwt.o
>  X86ASM-OBJS-$(CONFIG_DNXHD_ENCODER)    += x86/dnxhdenc.o
> +X86ASM-OBJS-$(CONFIG_EXR_DECODER)      += x86/exrdsp.o
>  X86ASM-OBJS-$(CONFIG_FLAC_DECODER)     += x86/flacdsp.o
>  ifdef CONFIG_GPL
>  X86ASM-OBJS-$(CONFIG_FLAC_ENCODER)     += x86/flac_dsp_gpl.o
> diff --git a/libavcodec/x86/exrdsp.asm b/libavcodec/x86/exrdsp.asm
> new file mode 100644
> index 0000000000..f609c055b0
> --- /dev/null
> +++ b/libavcodec/x86/exrdsp.asm
> @@ -0,0 +1,69 @@
> +;******************************************************************************
> +;* X86 Optimized functions for Open Exr Decoder
> +;* Copyright (c) 2006 Industrial Light & Magic, a division of Lucas Digital Ltd. LLC
> +;*
> +;* reorder_pixels based on patch by John Loy
> +;* port to ASM by Jokyo Images support by CNC - French National Center for Cinema
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION .text
> +
> +;------------------------------------------------------------------------------
> +; void ff_reorder_pixels(uint8_t *src, uint8_t *dst, ptrdiff_t size)
> +;------------------------------------------------------------------------------
> +
> +%macro REORDER_PIXELS 0
> +cglobal reorder_pixels, 3,4,3, src1, dst, size, src2
> +    lea                              src2q, [src1q+sizeq] ; src2 = src + 2 * half_size
> +    add                               dstq, sizeq         ; dst offset by size
> +    shr                              sizeq, 1             ; half_size
> +    add                              src1q, sizeq         ; offset src by half_size
> +    neg                              sizeq                ; size = offset for dst, src1, src2
> +.loop:
> +
> +%if cpuflag(avx2)
> +    vpermq                              m0, [src1q + sizeq], 0xd8; load first part
> +    vpermq                              m1, [src2q + sizeq], 0xd8; load second part
> +
> +    vpunpcklbw                          m2,  m0, m1              ; interleaved part 1
> +    vmovdqa               [dstq + sizeq*2], m2                   ; copy to dst
> +
> +    vpunpckhbw                          m0, m0, m1               ; interleaved part 2
> +    vmovdqa      [dstq + sizeq*2 + mmsize], m0                   ; copy to dst
> +%else
> +    mova                                m0, [src1q+sizeq]        ; load first part
> +    movu                                m1, [src2q+sizeq]        ; load second part
> +    SBUTTERFLY bw, 0, 1, 2                                       ; interleaved
> +    mova                 [dstq+2*sizeq   ], m0                   ; copy to dst
> +    mova             [dstq+2*sizeq+mmsize], m1
> +%endif
> +    add     sizeq, mmsize
> +    jl .loop
> +    RET

You can reuse the SBUTTERFLY and the two mova stores in the AVX2 version as
well. The resulting assembly is essentially the same, and it will look much
cleaner here:

%if cpuflag(avx2)
    vpermq                  m0, [src1q+sizeq], 0xd8 ; load first part
    vpermq                  m1, [src2q+sizeq], 0xd8 ; load second part
%else
    mova                    m0, [src1q+sizeq]       ; load first part
    movu                    m1, [src2q+sizeq]       ; load second part
%endif
    SBUTTERFLY bw, 0, 1, 2                          ; interleaved
    mova     [dstq+2*sizeq   ], m0                  ; copy to dst
    mova [dstq+2*sizeq+mmsize], m1

> +%endmacro
> +
> +INIT_XMM sse2
> +REORDER_PIXELS
> +
> +%if HAVE_AVX2_EXTERNAL
> +INIT_YMM avx2
> +REORDER_PIXELS
> +%endif
> diff --git a/libavcodec/x86/exrdsp_init.c b/libavcodec/x86/exrdsp_init.c
> new file mode 100644
> index 0000000000..49fd00e640
> --- /dev/null
> +++ b/libavcodec/x86/exrdsp_init.c
> @@ -0,0 +1,43 @@
> +/*
> + * OpenEXR (.exr) image decoder
> + *
> + * Copyright (c) 2006 Industrial Light & Magic, a division of Lucas Digital Ltd. LLC
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/x86/cpu.h"
> +#include "libavcodec/exrdsp.h"
> +
> +void ff_reorder_pixels_sse2(uint8_t *src, uint8_t *dst, ptrdiff_t size);
> +
> +void ff_reorder_pixels_avx2(uint8_t *src, uint8_t *dst, ptrdiff_t size);
> +
> +av_cold void ff_exrdsp_init_x86(ExrDSPContext *dsp)
> +{
> +#if ARCH_X86_64

The functions are also being assembled on x86_32, and they should work just
fine on such targets. So why limit the initialization to x86_64 only here?

> +    int cpu_flags = av_get_cpu_flags();
> +
> +    if (EXTERNAL_SSE2(cpu_flags)) {
> +        dsp->reorder_pixels = ff_reorder_pixels_sse2;
> +    }
> +    if (EXTERNAL_AVX2(cpu_flags)) {

EXTERNAL_AVX2_FAST(cpu_flags)

The AVX2 function uses YMM registers, meaning it will be slow on certain
AMD CPUs. The _FAST version of the macro makes sure it will not be used
with those.

> +        dsp->reorder_pixels = ff_reorder_pixels_avx2;
> +    }
> +#endif /* ARCH_X86_64 */
> +}
> -- 
> 2.11.0 (Apple Git-81)
> 

fate-exr passes on mingw-w64 as well.


More information about the ffmpeg-devel mailing list