[FFmpeg-devel] [PATCH] vp9: 16bpp tm/dc/h/v intra pred simd (mostly sse2) functions.

Matt Oliver protogonoi at gmail.com
Mon Oct 5 10:55:56 CEST 2015


On 3 October 2015 at 21:05, Ronald S. Bultje <rsbultje at gmail.com> wrote:

> ---
>  libavcodec/x86/Makefile                     |   1 +
>  libavcodec/x86/constants.c                  |   4 +
>  libavcodec/x86/constants.h                  |   2 +
>  libavcodec/x86/h264_idct_10bit.asm          |   5 +-
>  libavcodec/x86/h264_intrapred_10bit.asm     |   2 +-
>  libavcodec/x86/vp9dsp_init.h                |  23 ++
>  libavcodec/x86/vp9dsp_init_16bpp.c          |  15 +
>  libavcodec/x86/vp9dsp_init_16bpp_template.c |   7 +
>  libavcodec/x86/vp9intrapred_16bpp.asm       | 615 ++++++++++++++++++++++++++++
>  9 files changed, 669 insertions(+), 5 deletions(-)
>  create mode 100644 libavcodec/x86/vp9intrapred_16bpp.asm
>
> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
> index 01e5f18..5ff3a77 100644
> --- a/libavcodec/x86/Makefile
> +++ b/libavcodec/x86/Makefile
> @@ -158,6 +158,7 @@ YASM-OBJS-$(CONFIG_VC1_DECODER)        += x86/vc1dsp.o
>  YASM-OBJS-$(CONFIG_VORBIS_DECODER)     += x86/vorbisdsp.o
>  YASM-OBJS-$(CONFIG_VP6_DECODER)        += x86/vp6dsp.o
>  YASM-OBJS-$(CONFIG_VP9_DECODER)        += x86/vp9intrapred.o            \
> +                                          x86/vp9intrapred_16bpp.o      \
>                                            x86/vp9itxfm.o                \
>                                            x86/vp9lpf.o                  \
>                                            x86/vp9lpf_16bpp.o            \
> diff --git a/libavcodec/x86/constants.c b/libavcodec/x86/constants.c
> index 9f3c8b4..19345f5 100644
> --- a/libavcodec/x86/constants.c
> +++ b/libavcodec/x86/constants.c
> @@ -81,3 +81,7 @@ DECLARE_ALIGNED(16, const xmm_reg,  ff_ps_neg)  = { 0x8000000080000000ULL, 0x800
>
>  DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_1)    = { 0x0000000100000001ULL, 0x0000000100000001ULL,
>                                                      0x0000000100000001ULL, 0x0000000100000001ULL };
> +DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_16)   = { 0x0000001000000010ULL, 0x0000001000000010ULL,
> +                                                    0x0000001000000010ULL, 0x0000001000000010ULL };
> +DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_32)   = { 0x0000002000000020ULL, 0x0000002000000020ULL,
> +                                                    0x0000002000000020ULL, 0x0000002000000020ULL };
> diff --git a/libavcodec/x86/constants.h b/libavcodec/x86/constants.h
> index 37a1869..4a2451d 100644
> --- a/libavcodec/x86/constants.h
> +++ b/libavcodec/x86/constants.h
> @@ -63,5 +63,7 @@ extern const uint64_t ff_pb_FC;
>  extern const xmm_reg  ff_ps_neg;
>
>  extern const ymm_reg  ff_pd_1;
> +extern const ymm_reg  ff_pd_16;
> +extern const ymm_reg  ff_pd_32;
>
>  #endif /* AVCODEC_X86_CONSTANTS_H */
> diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm
> index cc115b0..f1c2c81 100644
> --- a/libavcodec/x86/h264_idct_10bit.asm
> +++ b/libavcodec/x86/h264_idct_10bit.asm
> @@ -24,14 +24,11 @@
>
>  %include "libavutil/x86/x86util.asm"
>
> -SECTION_RODATA
> -
> -pd_32:        times 4 dd 32
> -
>  SECTION .text
>
>  cextern pw_1023
>  %define pw_pixel_max pw_1023
> +cextern pd_32
>
>
>  ;-----------------------------------------------------------------------------
>  ; void ff_h264_idct_add_10(pixel *dst, int16_t *block, int stride)
> diff --git a/libavcodec/x86/h264_intrapred_10bit.asm b/libavcodec/x86/h264_intrapred_10bit.asm
> index 9aeb702..9e40cfe 100644
> --- a/libavcodec/x86/h264_intrapred_10bit.asm
> +++ b/libavcodec/x86/h264_intrapred_10bit.asm
> @@ -34,11 +34,11 @@ cextern pw_8
>  cextern pw_4
>  cextern pw_2
>  cextern pw_1
> +cextern pd_16
>
>  pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
>  pw_m3:        times 8 dw -3
>  pd_17:        times 4 dd 17
> -pd_16:        times 4 dd 16
>
>  SECTION .text
>
> diff --git a/libavcodec/x86/vp9dsp_init.h b/libavcodec/x86/vp9dsp_init.h
> index d1a9514..47d2246 100644
> --- a/libavcodec/x86/vp9dsp_init.h
> +++ b/libavcodec/x86/vp9dsp_init.h
> @@ -41,6 +41,18 @@ decl_mc_func(avg, sz, h, opt, type, fsz, bpp); \
>  decl_mc_func(put, sz, v, opt, type, fsz, bpp); \
>  decl_mc_func(avg, sz, v, opt, type, fsz, bpp)
>
> +#define decl_ipred_fn(type, sz, bpp, opt) \
> +void ff_vp9_ipred_##type##_##sz##x##sz##_##bpp##_##opt(uint8_t *dst, \
> +                                                       ptrdiff_t stride, \
> +                                                       const uint8_t *l, \
> +                                                       const uint8_t *a)
> +
> +#define decl_ipred_fns(type, bpp, opt4, opt8_16_32) \
> +decl_ipred_fn(type,  4, bpp, opt4); \
> +decl_ipred_fn(type,  8, bpp, opt8_16_32); \
> +decl_ipred_fn(type, 16, bpp, opt8_16_32); \
> +decl_ipred_fn(type, 32, bpp, opt8_16_32)
> +
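
A side note for readers following the macros: one invocation of the helper
above, e.g. decl_ipred_fns(dc, 16, mmxext, sse2), expands to prototypes
along these lines -- my illustration, not part of the patch:

    void ff_vp9_ipred_dc_4x4_16_mmxext(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *l, const uint8_t *a);
    void ff_vp9_ipred_dc_8x8_16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *l, const uint8_t *a);
    /* ... 16x16 and 32x32 follow the same pattern with _sse2 */

which is what the cglobal labels in the new .asm file mangle to.
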
>  #define mc_rep_func(avg, sz, hsz, hszb, dir, opt, type, f_sz, bpp) \
>  static av_always_inline void \
>  ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
> @@ -142,6 +154,17 @@ filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt)
>      init_subpel3_8to64(idx, type, bpp, opt); \
>      init_subpel2(4, idx,  4, type, bpp, opt)
>
> +#define cat(a, bpp, b) a##bpp##b
> +
> +#define init_ipred_func(type, enum, sz, bpp, opt) \
> +    dsp->intra_pred[TX_##sz##X##sz][enum##_PRED] = \
> +        cat(ff_vp9_ipred_##type##_##sz##x##sz##_, bpp, _##opt)
> +
> +#define init_8_16_32_ipred_funcs(type, enum, bpp, opt) \
> +    init_ipred_func(type, enum,  8, bpp, opt); \
> +    init_ipred_func(type, enum, 16, bpp, opt); \
> +    init_ipred_func(type, enum, 32, bpp, opt)
> +
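
Similarly, init_ipred_func(dc, DC, 8, 16, sse2) boils down to:

    dsp->intra_pred[TX_8X8][DC_PRED] = ff_vp9_ipred_dc_8x8_16_sse2;

and the cat() indirection is presumably there so that a bpp argument like
BPC gets expanded to 10/12 before the token pasting happens.
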
>  void ff_vp9dsp_init_10bpp_x86(VP9DSPContext *dsp);
>  void ff_vp9dsp_init_12bpp_x86(VP9DSPContext *dsp);
>  void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp);
> diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c
> index bd61e24..f4a4a5d 100644
> --- a/libavcodec/x86/vp9dsp_init_16bpp.c
> +++ b/libavcodec/x86/vp9dsp_init_16bpp.c
> @@ -46,6 +46,11 @@ decl_fpel_func(avg,  32, _16, avx2);
>  decl_fpel_func(avg,  64, _16, avx2);
>  decl_fpel_func(avg, 128, _16, avx2);
>
> +decl_ipred_fns(v,       16, mmx,    sse);
> +decl_ipred_fns(h,       16, mmxext, sse2);
> +decl_ipred_fns(dc,      16, mmxext, sse2);
> +decl_ipred_fns(dc_top,  16, mmxext, sse2);
> +decl_ipred_fns(dc_left, 16, mmxext, sse2);
>  #endif /* HAVE_YASM */
>
>  av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
> @@ -55,10 +60,15 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
>
>      if (EXTERNAL_MMX(cpu_flags)) {
>          init_fpel_func(4, 0,   8, put, , mmx);
> +        init_ipred_func(v, VERT, 4, 16, mmx);
>      }
>
>      if (EXTERNAL_MMXEXT(cpu_flags)) {
>          init_fpel_func(4, 1,   8, avg, _16, mmxext);
> +        init_ipred_func(h,       HOR,     4, 16, mmxext);
> +        init_ipred_func(dc,      DC,      4, 16, mmxext);
> +        init_ipred_func(dc_top,  TOP_DC,  4, 16, mmxext);
> +        init_ipred_func(dc_left, LEFT_DC, 4, 16, mmxext);
>      }
>
>      if (EXTERNAL_SSE(cpu_flags)) {
> @@ -66,6 +76,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
>          init_fpel_func(2, 0,  32, put, , sse);
>          init_fpel_func(1, 0,  64, put, , sse);
>          init_fpel_func(0, 0, 128, put, , sse);
> +        init_8_16_32_ipred_funcs(v, VERT, 16, sse);
>      }
>
>      if (EXTERNAL_SSE2(cpu_flags)) {
> @@ -73,6 +84,10 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
>          init_fpel_func(2, 1,  32, avg, _16, sse2);
>          init_fpel_func(1, 1,  64, avg, _16, sse2);
>          init_fpel_func(0, 1, 128, avg, _16, sse2);
> +        init_8_16_32_ipred_funcs(h,       HOR,     16, sse2);
> +        init_8_16_32_ipred_funcs(dc,      DC,      16, sse2);
> +        init_8_16_32_ipred_funcs(dc_top,  TOP_DC,  16, sse2);
> +        init_8_16_32_ipred_funcs(dc_left, LEFT_DC, 16, sse2);
>      }
>
>      if (EXTERNAL_AVX_FAST(cpu_flags)) {
> diff --git a/libavcodec/x86/vp9dsp_init_16bpp_template.c b/libavcodec/x86/vp9dsp_init_16bpp_template.c
> index 56cd79e..f486caf 100644
> --- a/libavcodec/x86/vp9dsp_init_16bpp_template.c
> +++ b/libavcodec/x86/vp9dsp_init_16bpp_template.c
> @@ -121,6 +121,8 @@ lpf_mix2_wrappers(8, 8, bpp, opt); \
>  lpf_mix2_wrappers_set(BPC, sse2);
>  lpf_mix2_wrappers_set(BPC, ssse3);
>  lpf_mix2_wrappers_set(BPC, avx);
> +
> +decl_ipred_fns(tm, BPC, mmxext, sse2);
>  #endif /* HAVE_YASM */
>
>  av_cold void INIT_FUNC(VP9DSPContext *dsp)
> @@ -153,10 +155,15 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp)
>      init_lpf_mix2_func(1, 0, 1, v, 8, 4, bpp, opt); \
>      init_lpf_mix2_func(1, 1, 1, v, 8, 8, bpp, opt)
>
> +    if (EXTERNAL_MMXEXT(cpu_flags)) {
> +        init_ipred_func(tm, TM_VP8, 4, BPC, mmxext);
> +    }
> +
>      if (EXTERNAL_SSE2(cpu_flags)) {
>          init_subpel3(0, put, BPC, sse2);
>          init_subpel3(1, avg, BPC, sse2);
>          init_lpf_funcs(BPC, sse2);
> +        init_8_16_32_ipred_funcs(tm, TM_VP8, BPC, sse2);
>      }
>
>      if (EXTERNAL_SSSE3(cpu_flags)) {
> diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm
> new file mode 100644
> index 0000000..018d92d
> --- /dev/null
> +++ b/libavcodec/x86/vp9intrapred_16bpp.asm
> @@ -0,0 +1,615 @@
> +;******************************************************************************
> +;* VP9 Intra prediction SIMD optimizations
> +;*
> +;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
> +;* Copyright (c) 2015 Henrik Gramner <henrik gramner com>
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA 32
> +
> +pd_2: times 8 dd 2
> +pd_4: times 8 dd 4
> +pd_8: times 8 dd 8
> +
> +cextern pw_1
> +cextern pw_1023
> +cextern pw_4095
> +cextern pd_16
> +cextern pd_32
> +
> +SECTION .text
> +
> +INIT_MMX mmx
> +cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a
> +    movifnidn               aq, amp
> +    mova                    m0, [aq]
> +    DEFINE_ARGS dst, stride, stride3
> +    lea               stride3q, [strideq*3]
> +    mova      [dstq+strideq*0], m0
> +    mova      [dstq+strideq*1], m0
> +    mova      [dstq+strideq*2], m0
> +    mova      [dstq+stride3q ], m0
> +    RET
> +
> +INIT_XMM sse
> +cglobal vp9_ipred_v_8x8_16, 2, 4, 1, dst, stride, l, a
> +    movifnidn               aq, amp
> +    mova                    m0, [aq]
> +    DEFINE_ARGS dst, stride, stride3
> +    lea               stride3q, [strideq*3]
> +    mova      [dstq+strideq*0], m0
> +    mova      [dstq+strideq*1], m0
> +    mova      [dstq+strideq*2], m0
> +    mova      [dstq+stride3q ], m0
> +    lea                   dstq, [dstq+strideq*4]
> +    mova      [dstq+strideq*0], m0
> +    mova      [dstq+strideq*1], m0
> +    mova      [dstq+strideq*2], m0
> +    mova      [dstq+stride3q ], m0
> +    RET
> +
> +INIT_XMM sse
> +cglobal vp9_ipred_v_16x16_16, 2, 4, 2, dst, stride, l, a
> +    movifnidn               aq, amp
> +    mova                    m0, [aq]
> +    mova                    m1, [aq+mmsize]
> +    DEFINE_ARGS dst, stride, stride3, cnt
> +    lea               stride3q, [strideq*3]
> +    mov                   cntd, 4
> +.loop:
> +    mova   [dstq+strideq*0+ 0], m0
> +    mova   [dstq+strideq*0+16], m1
> +    mova   [dstq+strideq*1+ 0], m0
> +    mova   [dstq+strideq*1+16], m1
> +    mova   [dstq+strideq*2+ 0], m0
> +    mova   [dstq+strideq*2+16], m1
> +    mova   [dstq+stride3q + 0], m0
> +    mova   [dstq+stride3q +16], m1
> +    lea                   dstq, [dstq+strideq*4]
> +    dec               cntd
> +    jg .loop
> +    RET
> +
> +INIT_XMM sse
> +cglobal vp9_ipred_v_32x32_16, 2, 4, 4, dst, stride, l, a
> +    movifnidn               aq, amp
> +    mova                    m0, [aq+mmsize*0]
> +    mova                    m1, [aq+mmsize*1]
> +    mova                    m2, [aq+mmsize*2]
> +    mova                    m3, [aq+mmsize*3]
> +    DEFINE_ARGS dst, stride, cnt
> +    mov                   cntd, 16
> +.loop:
> +    mova   [dstq+strideq*0+ 0], m0
> +    mova   [dstq+strideq*0+16], m1
> +    mova   [dstq+strideq*0+32], m2
> +    mova   [dstq+strideq*0+48], m3
> +    mova   [dstq+strideq*1+ 0], m0
> +    mova   [dstq+strideq*1+16], m1
> +    mova   [dstq+strideq*1+32], m2
> +    mova   [dstq+strideq*1+48], m3
> +    lea                   dstq, [dstq+strideq*2]
> +    dec               cntd
> +    jg .loop
> +    RET
> +
> +INIT_MMX mmxext
> +cglobal vp9_ipred_h_4x4_16, 3, 3, 4, dst, stride, l, a
> +    mova                    m3, [lq]
> +    DEFINE_ARGS dst, stride, stride3
> +    lea               stride3q, [strideq*3]
> +    pshufw                  m0, m3, q3333
> +    pshufw                  m1, m3, q2222
> +    pshufw                  m2, m3, q1111
> +    pshufw                  m3, m3, q0000
> +    mova      [dstq+strideq*0], m0
> +    mova      [dstq+strideq*1], m1
> +    mova      [dstq+strideq*2], m2
> +    mova      [dstq+stride3q ], m3
> +    RET
> +
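
Side note for readers: l[] is passed bottom-to-top here, which is why the
q3333 shuffle, i.e. l[3], fills the first row. As a rough scalar paraphrase
of the 4x4 case (untested, names are mine):

    static void h_4x4_16_ref(uint16_t *dst, ptrdiff_t stride_bytes,
                             const uint16_t *left)
    {
        /* each row is the corresponding left pixel, replicated */
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                dst[y * (stride_bytes / 2) + x] = left[3 - y];
    }
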
> +INIT_XMM sse2
> +cglobal vp9_ipred_h_8x8_16, 3, 3, 4, dst, stride, l, a
> +    mova                    m2, [lq]
> +    DEFINE_ARGS dst, stride, stride3
> +    lea               stride3q, [strideq*3]
> +    punpckhwd               m3, m2, m2
> +    pshufd                  m0, m3, q3333
> +    pshufd                  m1, m3, q2222
> +    mova      [dstq+strideq*0], m0
> +    mova      [dstq+strideq*1], m1
> +    pshufd                  m0, m3, q1111
> +    pshufd                  m1, m3, q0000
> +    mova      [dstq+strideq*2], m0
> +    mova      [dstq+stride3q ], m1
> +    lea                   dstq, [dstq+strideq*4]
> +    punpcklwd               m2, m2
> +    pshufd                  m0, m2, q3333
> +    pshufd                  m1, m2, q2222
> +    mova      [dstq+strideq*0], m0
> +    mova      [dstq+strideq*1], m1
> +    pshufd                  m0, m2, q1111
> +    pshufd                  m1, m2, q0000
> +    mova      [dstq+strideq*2], m0
> +    mova      [dstq+stride3q ], m1
> +    RET
> +
> +INIT_XMM sse2
> +cglobal vp9_ipred_h_16x16_16, 3, 5, 4, dst, stride, l, stride3, cnt
> +    mov                   cntd, 3
> +    lea               stride3q, [strideq*3]
> +.loop:
> +    movh                    m3, [lq+cntq*8]
> +    punpcklwd               m3, m3
> +    pshufd                  m0, m3, q3333
> +    pshufd                  m1, m3, q2222
> +    pshufd                  m2, m3, q1111
> +    pshufd                  m3, m3, q0000
> +    mova   [dstq+strideq*0+ 0], m0
> +    mova   [dstq+strideq*0+16], m0
> +    mova   [dstq+strideq*1+ 0], m1
> +    mova   [dstq+strideq*1+16], m1
> +    mova   [dstq+strideq*2+ 0], m2
> +    mova   [dstq+strideq*2+16], m2
> +    mova   [dstq+stride3q + 0], m3
> +    mova   [dstq+stride3q +16], m3
> +    lea                   dstq, [dstq+strideq*4]
> +    dec                   cntd
> +    jge .loop
> +    RET
> +
> +INIT_XMM sse2
> +cglobal vp9_ipred_h_32x32_16, 3, 5, 4, dst, stride, l, stride3, cnt
> +    mov                   cntd, 7
> +    lea               stride3q, [strideq*3]
> +.loop:
> +    movh                    m3, [lq+cntq*8]
> +    punpcklwd               m3, m3
> +    pshufd                  m0, m3, q3333
> +    pshufd                  m1, m3, q2222
> +    pshufd                  m2, m3, q1111
> +    pshufd                  m3, m3, q0000
> +    mova   [dstq+strideq*0+ 0], m0
> +    mova   [dstq+strideq*0+16], m0
> +    mova   [dstq+strideq*0+32], m0
> +    mova   [dstq+strideq*0+48], m0
> +    mova   [dstq+strideq*1+ 0], m1
> +    mova   [dstq+strideq*1+16], m1
> +    mova   [dstq+strideq*1+32], m1
> +    mova   [dstq+strideq*1+48], m1
> +    mova   [dstq+strideq*2+ 0], m2
> +    mova   [dstq+strideq*2+16], m2
> +    mova   [dstq+strideq*2+32], m2
> +    mova   [dstq+strideq*2+48], m2
> +    mova   [dstq+stride3q + 0], m3
> +    mova   [dstq+stride3q +16], m3
> +    mova   [dstq+stride3q +32], m3
> +    mova   [dstq+stride3q +48], m3
> +    lea                   dstq, [dstq+strideq*4]
> +    dec                   cntd
> +    jge .loop
> +    RET
> +
> +INIT_MMX mmxext
> +cglobal vp9_ipred_dc_4x4_16, 4, 4, 2, dst, stride, l, a
> +    mova                    m0, [lq]
> +    paddw                   m0, [aq]
> +    DEFINE_ARGS dst, stride, stride3
> +    lea               stride3q, [strideq*3]
> +    pmaddwd                 m0, [pw_1]
> +    pshufw                  m1, m0, q3232
> +    paddd                   m0, [pd_4]
> +    paddd                   m0, m1
> +    psrad                   m0, 3
> +    pshufw                  m0, m0, q0000
> +    mova      [dstq+strideq*0], m0
> +    mova      [dstq+strideq*1], m0
> +    mova      [dstq+strideq*2], m0
> +    mova      [dstq+stride3q ], m0
> +    RET
> +
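
For the curious: in scalar terms the dc function above computes roughly the
following (untested paraphrase, names are mine):

    static void dc_4x4_16_ref(uint16_t *dst, ptrdiff_t stride_bytes,
                              const uint16_t *left, const uint16_t *above)
    {
        int sum = 0;
        for (int n = 0; n < 4; n++)
            sum += left[n] + above[n];
        /* pd_4 bias + psrad 3: round-to-nearest over 8 edge samples */
        int dc = (sum + 4) >> 3;
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                dst[y * (stride_bytes / 2) + x] = dc;
    }

The larger sizes are the same idea with wider sums (pd_8/pd_16/pd_32 bias,
shift 4/5/6).
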
> +INIT_XMM sse2
> +cglobal vp9_ipred_dc_8x8_16, 4, 4, 2, dst, stride, l, a
> +    mova                    m0, [lq]
> +    paddw                   m0, [aq]
> +    DEFINE_ARGS dst, stride, stride3
> +    lea               stride3q, [strideq*3]
> +    pmaddwd                 m0, [pw_1]
> +    pshufd                  m1, m0, q3232
> +    paddd                   m0, m1
> +    pshufd                  m1, m0, q1111
> +    paddd                   m0, [pd_8]
> +    paddd                   m0, m1
> +    psrad                   m0, 4
> +    pshuflw                 m0, m0, q0000
> +    punpcklqdq              m0, m0
> +    mova      [dstq+strideq*0], m0
> +    mova      [dstq+strideq*1], m0
> +    mova      [dstq+strideq*2], m0
> +    mova      [dstq+stride3q ], m0
> +    lea                   dstq, [dstq+strideq*4]
> +    mova      [dstq+strideq*0], m0
> +    mova      [dstq+strideq*1], m0
> +    mova      [dstq+strideq*2], m0
> +    mova      [dstq+stride3q ], m0
> +    RET
> +
> +INIT_XMM sse2
> +cglobal vp9_ipred_dc_16x16_16, 4, 4, 2, dst, stride, l, a
> +    mova                    m0, [lq]
> +    paddw                   m0, [lq+mmsize]
> +    paddw                   m0, [aq]
> +    paddw                   m0, [aq+mmsize]
> +    DEFINE_ARGS dst, stride, stride3, cnt
> +    lea               stride3q, [strideq*3]
> +    mov                   cntd, 4
> +    pmaddwd                 m0, [pw_1]
> +    pshufd                  m1, m0, q3232
> +    paddd                   m0, m1
> +    pshufd                  m1, m0, q1111
> +    paddd                   m0, [pd_16]
> +    paddd                   m0, m1
> +    psrad                   m0, 5
> +    pshuflw                 m0, m0, q0000
> +    punpcklqdq              m0, m0
> +.loop:
> +    mova   [dstq+strideq*0+ 0], m0
> +    mova   [dstq+strideq*0+16], m0
> +    mova   [dstq+strideq*1+ 0], m0
> +    mova   [dstq+strideq*1+16], m0
> +    mova   [dstq+strideq*2+ 0], m0
> +    mova   [dstq+strideq*2+16], m0
> +    mova   [dstq+stride3q + 0], m0
> +    mova   [dstq+stride3q +16], m0
> +    lea                   dstq, [dstq+strideq*4]
> +    dec                   cntd
> +    jg .loop
> +    RET
> +
> +INIT_XMM sse2
> +cglobal vp9_ipred_dc_32x32_16, 4, 4, 2, dst, stride, l, a
> +    mova                    m0, [lq+mmsize*0]
> +    paddw                   m0, [lq+mmsize*1]
> +    paddw                   m0, [lq+mmsize*2]
> +    paddw                   m0, [lq+mmsize*3]
> +    paddw                   m0, [aq+mmsize*0]
> +    paddw                   m0, [aq+mmsize*1]
> +    paddw                   m0, [aq+mmsize*2]
> +    paddw                   m0, [aq+mmsize*3]
> +    DEFINE_ARGS dst, stride, stride3, cnt
> +    lea               stride3q, [strideq*3]
> +    mov                   cntd, 16
> +    pmaddwd                 m0, [pw_1]
> +    pshufd                  m1, m0, q3232
> +    paddd                   m0, m1
> +    pshufd                  m1, m0, q1111
> +    paddd                   m0, [pd_32]
> +    paddd                   m0, m1
> +    psrad                   m0, 6
> +    pshuflw                 m0, m0, q0000
> +    punpcklqdq              m0, m0
> +.loop:
> +    mova   [dstq+strideq*0+ 0], m0
> +    mova   [dstq+strideq*0+16], m0
> +    mova   [dstq+strideq*0+32], m0
> +    mova   [dstq+strideq*0+48], m0
> +    mova   [dstq+strideq*1+ 0], m0
> +    mova   [dstq+strideq*1+16], m0
> +    mova   [dstq+strideq*1+32], m0
> +    mova   [dstq+strideq*1+48], m0
> +    lea                   dstq, [dstq+strideq*2]
> +    dec                   cntd
> +    jg .loop
> +    RET
> +
> +%macro DC_1D_FNS 2
> +INIT_MMX mmxext
> +cglobal vp9_ipred_dc_%1_4x4_16, 4, 4, 2, dst, stride, l, a
> +    mova                    m0, [%2]
> +    DEFINE_ARGS dst, stride, stride3
> +    lea               stride3q, [strideq*3]
> +    pmaddwd                 m0, [pw_1]
> +    pshufw                  m1, m0, q3232
> +    paddd                   m0, [pd_2]
> +    paddd                   m0, m1
> +    psrad                   m0, 2
> +    pshufw                  m0, m0, q0000
> +    mova      [dstq+strideq*0], m0
> +    mova      [dstq+strideq*1], m0
> +    mova      [dstq+strideq*2], m0
> +    mova      [dstq+stride3q ], m0
> +    RET
> +
> +INIT_XMM sse2
> +cglobal vp9_ipred_dc_%1_8x8_16, 4, 4, 2, dst, stride, l, a
> +    mova                    m0, [%2]
> +    DEFINE_ARGS dst, stride, stride3
> +    lea               stride3q, [strideq*3]
> +    pmaddwd                 m0, [pw_1]
> +    pshufd                  m1, m0, q3232
> +    paddd                   m0, m1
> +    pshufd                  m1, m0, q1111
> +    paddd                   m0, [pd_4]
> +    paddd                   m0, m1
> +    psrad                   m0, 3
> +    pshuflw                 m0, m0, q0000
> +    punpcklqdq              m0, m0
> +    mova      [dstq+strideq*0], m0
> +    mova      [dstq+strideq*1], m0
> +    mova      [dstq+strideq*2], m0
> +    mova      [dstq+stride3q ], m0
> +    lea                   dstq, [dstq+strideq*4]
> +    mova      [dstq+strideq*0], m0
> +    mova      [dstq+strideq*1], m0
> +    mova      [dstq+strideq*2], m0
> +    mova      [dstq+stride3q ], m0
> +    RET
> +
> +INIT_XMM sse2
> +cglobal vp9_ipred_dc_%1_16x16_16, 4, 4, 2, dst, stride, l, a
> +    mova                    m0, [%2]
> +    paddw                   m0, [%2+mmsize]
> +    DEFINE_ARGS dst, stride, stride3, cnt
> +    lea               stride3q, [strideq*3]
> +    mov                   cntd, 4
> +    pmaddwd                 m0, [pw_1]
> +    pshufd                  m1, m0, q3232
> +    paddd                   m0, m1
> +    pshufd                  m1, m0, q1111
> +    paddd                   m0, [pd_8]
> +    paddd                   m0, m1
> +    psrad                   m0, 4
> +    pshuflw                 m0, m0, q0000
> +    punpcklqdq              m0, m0
> +.loop:
> +    mova   [dstq+strideq*0+ 0], m0
> +    mova   [dstq+strideq*0+16], m0
> +    mova   [dstq+strideq*1+ 0], m0
> +    mova   [dstq+strideq*1+16], m0
> +    mova   [dstq+strideq*2+ 0], m0
> +    mova   [dstq+strideq*2+16], m0
> +    mova   [dstq+stride3q + 0], m0
> +    mova   [dstq+stride3q +16], m0
> +    lea                   dstq, [dstq+strideq*4]
> +    dec                   cntd
> +    jg .loop
> +    RET
> +
> +INIT_XMM sse2
> +cglobal vp9_ipred_dc_%1_32x32_16, 4, 4, 2, dst, stride, l, a
> +    mova                    m0, [%2+mmsize*0]
> +    paddw                   m0, [%2+mmsize*1]
> +    paddw                   m0, [%2+mmsize*2]
> +    paddw                   m0, [%2+mmsize*3]
> +    DEFINE_ARGS dst, stride, cnt
> +    mov                   cntd, 16
> +    pmaddwd                 m0, [pw_1]
> +    pshufd                  m1, m0, q3232
> +    paddd                   m0, m1
> +    pshufd                  m1, m0, q1111
> +    paddd                   m0, [pd_16]
> +    paddd                   m0, m1
> +    psrad                   m0, 5
> +    pshuflw                 m0, m0, q0000
> +    punpcklqdq              m0, m0
> +.loop:
> +    mova   [dstq+strideq*0+ 0], m0
> +    mova   [dstq+strideq*0+16], m0
> +    mova   [dstq+strideq*0+32], m0
> +    mova   [dstq+strideq*0+48], m0
> +    mova   [dstq+strideq*1+ 0], m0
> +    mova   [dstq+strideq*1+16], m0
> +    mova   [dstq+strideq*1+32], m0
> +    mova   [dstq+strideq*1+48], m0
> +    lea                   dstq, [dstq+strideq*2]
> +    dec                   cntd
> +    jg .loop
> +    RET
> +%endmacro
> +
> +DC_1D_FNS top,  aq
> +DC_1D_FNS left, lq
> +
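
The top/left variants average a single edge, so the bias and shift drop by
one step relative to the two-edge dc above; e.g. for dc_top 4x4 the math is
effectively:

    int dc = (above[0] + above[1] + above[2] + above[3] + 2) >> 2;

per the pd_2 bias and psrad 2 in the macro.
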
> +INIT_MMX mmxext
> +cglobal vp9_ipred_tm_4x4_10, 4, 4, 6, dst, stride, l, a
> +    mova                    m5, [pw_1023]
> +.body:
> +    mova                    m4, [aq]
> +    mova                    m3, [lq]
> +    movd                    m0, [aq-4]
> +    pshufw                  m0, m0, q1111
> +    psubw                   m4, m0
> +    DEFINE_ARGS dst, stride, stride3
> +    lea               stride3q, [strideq*3]
> +    pshufw                  m0, m3, q3333
> +    pshufw                  m1, m3, q2222
> +    pshufw                  m2, m3, q1111
> +    pshufw                  m3, m3, q0000
> +    paddw                   m0, m4
> +    paddw                   m1, m4
> +    paddw                   m2, m4
> +    paddw                   m3, m4
> +    pxor                    m4, m4
> +    pmaxsw                  m0, m4
> +    pmaxsw                  m1, m4
> +    pmaxsw                  m2, m4
> +    pmaxsw                  m3, m4
> +    pminsw                  m0, m5
> +    pminsw                  m1, m5
> +    pminsw                  m2, m5
> +    pminsw                  m3, m5
> +    mova      [dstq+strideq*0], m0
> +    mova      [dstq+strideq*1], m1
> +    mova      [dstq+strideq*2], m2
> +    mova      [dstq+stride3q ], m3
> +    RET
> +
> +cglobal vp9_ipred_tm_4x4_12, 4, 4, 6, dst, stride, l, a
> +    mova                    m5, [pw_4095]
> +    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_4x4_10 %+ SUFFIX).body
> +
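
For reference, tm ("TrueMotion") predicts each pixel as
clip(left + above - topleft). A rough scalar paraphrase of the 4x4/10bpp
entry point (untested, names are mine):

    static void tm_4x4_10_ref(uint16_t *dst, ptrdiff_t stride_bytes,
                              const uint16_t *left, const uint16_t *above)
    {
        const int topleft = above[-1]; /* the movd [aq-4] + q1111 shuffle */
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                dst[y * (stride_bytes / 2) + x] =
                    av_clip(left[3 - y] + above[x] - topleft, 0, 1023);
    }

The 12bpp variant only loads 4095 as the clip ceiling and jumps into the
shared .body, which is what the mangle(...) jump above does.
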
> +INIT_XMM sse2
> +cglobal vp9_ipred_tm_8x8_10, 4, 5, 7, dst, stride, l, a
> +    mova                    m4, [pw_1023]
> +.body:
> +    pxor                    m6, m6
> +    mova                    m5, [aq]
> +    movd                    m0, [aq-4]
> +    pshuflw                 m0, m0, q1111
> +    punpcklqdq              m0, m0
> +    psubw                   m5, m0
> +    DEFINE_ARGS dst, stride, l, stride3, cnt
> +    lea               stride3q, [strideq*3]
> +    mov                   cntd, 1
> +.loop:
> +    movh                    m3, [lq+cntq*8]
> +    punpcklwd               m3, m3
> +    pshufd                  m0, m3, q3333
> +    pshufd                  m1, m3, q2222
> +    pshufd                  m2, m3, q1111
> +    pshufd                  m3, m3, q0000
> +    paddw                   m0, m5
> +    paddw                   m1, m5
> +    paddw                   m2, m5
> +    paddw                   m3, m5
> +    pmaxsw                  m0, m6
> +    pmaxsw                  m1, m6
> +    pmaxsw                  m2, m6
> +    pmaxsw                  m3, m6
> +    pminsw                  m0, m4
> +    pminsw                  m1, m4
> +    pminsw                  m2, m4
> +    pminsw                  m3, m4
> +    mova      [dstq+strideq*0], m0
> +    mova      [dstq+strideq*1], m1
> +    mova      [dstq+strideq*2], m2
> +    mova      [dstq+stride3q ], m3
> +    lea                   dstq, [dstq+strideq*4]
> +    dec                   cntd
> +    jge .loop
> +    RET
> +
> +cglobal vp9_ipred_tm_8x8_12, 4, 5, 7, dst, stride, l, a
> +    mova                    m4, [pw_4095]
> +    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_8x8_10 %+ SUFFIX).body
> +
> +INIT_XMM sse2
> +cglobal vp9_ipred_tm_16x16_10, 4, 4, 8, dst, stride, l, a
> +    mova                    m7, [pw_1023]
> +.body:
> +    pxor                    m6, m6
> +    mova                    m4, [aq]
> +    mova                    m5, [aq+mmsize]
> +    movd                    m0, [aq-4]
> +    pshuflw                 m0, m0, q1111
> +    punpcklqdq              m0, m0
> +    psubw                   m4, m0
> +    psubw                   m5, m0
> +    DEFINE_ARGS dst, stride, l, cnt
> +    mov                   cntd, 7
> +.loop:
> +    movd                    m3, [lq+cntq*4]
> +    punpcklwd               m3, m3
> +    pshufd                  m2, m3, q1111
> +    pshufd                  m3, m3, q0000
> +    paddw                   m0, m2, m4
> +    paddw                   m2, m5
> +    paddw                   m1, m3, m4
> +    paddw                   m3, m5
> +    pmaxsw                  m0, m6
> +    pmaxsw                  m2, m6
> +    pmaxsw                  m1, m6
> +    pmaxsw                  m3, m6
> +    pminsw                  m0, m7
> +    pminsw                  m2, m7
> +    pminsw                  m1, m7
> +    pminsw                  m3, m7
> +    mova   [dstq+strideq*0+ 0], m0
> +    mova   [dstq+strideq*0+16], m2
> +    mova   [dstq+strideq*1+ 0], m1
> +    mova   [dstq+strideq*1+16], m3
> +    lea                   dstq, [dstq+strideq*2]
> +    dec                   cntd
> +    jge .loop
> +    RET
> +
> +cglobal vp9_ipred_tm_16x16_12, 4, 4, 8, dst, stride, l, a
> +    mova                    m7, [pw_4095]
> +    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_16x16_10 %+ SUFFIX).body
> +
> +INIT_XMM sse2
> +cglobal vp9_ipred_tm_32x32_10, 4, 4, 10, 32 * ARCH_X86_32, dst, stride, l, a
> +    mova                    m0, [pw_1023]
> +.body:
> +    pxor                    m1, m1
> +%if ARCH_X86_64
> +    SWAP                     0, 8
> +    SWAP                     1, 9
> +%define reg_min m9
> +%define reg_max m8
> +%else
> +    mova              [rsp+ 0], m0
> +    mova              [rsp+16], m1
> +%define reg_min [rsp+16]
> +%define reg_max [rsp+ 0]
> +%endif
> +
> +    mova                    m4, [aq+mmsize*0]
> +    mova                    m5, [aq+mmsize*1]
> +    mova                    m6, [aq+mmsize*2]
> +    mova                    m7, [aq+mmsize*3]
> +    movd                    m0, [aq-4]
> +    pshuflw                 m0, m0, q1111
> +    punpcklqdq              m0, m0
> +    psubw                   m4, m0
> +    psubw                   m5, m0
> +    psubw                   m6, m0
> +    psubw                   m7, m0
> +    DEFINE_ARGS dst, stride, l, cnt
> +    mov                   cntd, 31
> +.loop:
> +    pinsrw                  m3, [lq+cntq*2], 0
> +    punpcklwd               m3, m3
> +    pshufd                  m3, m3, q0000
> +    paddw                   m0, m3, m4
> +    paddw                   m1, m3, m5
> +    paddw                   m2, m3, m6
> +    paddw                   m3, m7
> +    pmaxsw                  m0, reg_min
> +    pmaxsw                  m1, reg_min
> +    pmaxsw                  m2, reg_min
> +    pmaxsw                  m3, reg_min
> +    pminsw                  m0, reg_max
> +    pminsw                  m1, reg_max
> +    pminsw                  m2, reg_max
> +    pminsw                  m3, reg_max
> +    mova   [dstq+strideq*0+ 0], m0
> +    mova   [dstq+strideq*0+16], m1
> +    mova   [dstq+strideq*0+32], m2
> +    mova   [dstq+strideq*0+48], m3
> +    add                   dstq, strideq
> +    dec                   cntd
> +    jge .loop
> +    RET
> +
> +cglobal vp9_ipred_tm_32x32_12, 4, 4, 10, 32 * ARCH_X86_32, dst, stride, l, a
> +    mova                    m0, [pw_4095]
> +    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body
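
Worth noting for the 32x32 tm case: x86_32 only has xmm0-xmm7, so the clip
bounds get spilled to the 32-byte stack buffer (the "32 * ARCH_X86_32"
above) and reloaded through the reg_min/reg_max defines, while x86_64 keeps
them resident in xmm8/xmm9. As far as I can see this is the only code path
in the file that differs between the two arches.
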
> --
> 2.1.2
>

This patch has broken the 32-bit MSVC builds:
http://fate.ffmpeg.org/report.cgi?time=20151005065109&slot=x86_32-msvc12-windows-native

I had a look through the code but couldn't find the cause. The error message
just points to the line of a macro instantiation, and I'm not familiar enough
with the code to work out where within the macro the error actually occurs,
or why. So I'll leave this one to someone more familiar with the code.
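
One trick that can help narrow such failures down (flags from memory, so
double-check against yasm --help): run the assembler in preprocess-only mode
so the macros get expanded, e.g. from the source root something like

    yasm -e -f win32 -I. libavcodec/x86/vp9intrapred_16bpp.asm > expanded.asm

and then assemble expanded.asm; the error line should then point into the
actual expansion rather than at the macro invocation.
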

