[FFmpeg-devel] [PATCH] vp9: 16bpp tm/dc/h/v intra pred simd (mostly sse2) functions.

Ronald S. Bultje rsbultje at gmail.com
Fri Sep 25 22:36:48 CEST 2015


---
 libavcodec/x86/Makefile                     |   1 +
 libavcodec/x86/constants.c                  |   2 +
 libavcodec/x86/constants.h                  |   1 +
 libavcodec/x86/vp9dsp_init.h                |  23 +
 libavcodec/x86/vp9dsp_init_16bpp.c          |  15 +
 libavcodec/x86/vp9dsp_init_16bpp_template.c |   7 +
 libavcodec/x86/vp9intrapred_16bpp.asm       | 672 ++++++++++++++++++++++++++++
 libavcodec/x86/vp9lpf_16bpp.asm             |   2 +-
 libavcodec/x86/vp9mc_16bpp.asm              |   2 +-
 9 files changed, 723 insertions(+), 2 deletions(-)
 create mode 100644 libavcodec/x86/vp9intrapred_16bpp.asm

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 9cddaac..44bf342 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -157,6 +157,7 @@ YASM-OBJS-$(CONFIG_VC1_DECODER)        += x86/vc1dsp.o
 YASM-OBJS-$(CONFIG_VORBIS_DECODER)     += x86/vorbisdsp.o
 YASM-OBJS-$(CONFIG_VP6_DECODER)        += x86/vp6dsp.o
 YASM-OBJS-$(CONFIG_VP9_DECODER)        += x86/vp9intrapred.o            \
+                                          x86/vp9intrapred_16bpp.o      \
                                           x86/vp9itxfm.o                \
                                           x86/vp9lpf.o                  \
                                           x86/vp9lpf_16bpp.o            \
diff --git a/libavcodec/x86/constants.c b/libavcodec/x86/constants.c
index 553dd49..9f3c8b4 100644
--- a/libavcodec/x86/constants.c
+++ b/libavcodec/x86/constants.c
@@ -55,6 +55,8 @@ DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_1024) = { 0x0400040004000400ULL, 0x040
                                                     0x0400040004000400ULL, 0x0400040004000400ULL};
 DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_2048) = { 0x0800080008000800ULL, 0x0800080008000800ULL,
                                                     0x0800080008000800ULL, 0x0800080008000800ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_4095) = { 0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL,
+                                                    0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL };
 DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_4096) = { 0x1000100010001000ULL, 0x1000100010001000ULL,
                                                     0x1000100010001000ULL, 0x1000100010001000ULL };
 DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_8192) = { 0x2000200020002000ULL, 0x2000200020002000ULL,
diff --git a/libavcodec/x86/constants.h b/libavcodec/x86/constants.h
index 33dbb65..37a1869 100644
--- a/libavcodec/x86/constants.h
+++ b/libavcodec/x86/constants.h
@@ -47,6 +47,7 @@ extern const ymm_reg  ff_pw_512;
 extern const ymm_reg  ff_pw_1023;
 extern const ymm_reg  ff_pw_1024;
 extern const ymm_reg  ff_pw_2048;
+extern const ymm_reg  ff_pw_4095;
 extern const ymm_reg  ff_pw_4096;
 extern const ymm_reg  ff_pw_8192;
 extern const ymm_reg  ff_pw_m1;
diff --git a/libavcodec/x86/vp9dsp_init.h b/libavcodec/x86/vp9dsp_init.h
index d1a9514..47d2246 100644
--- a/libavcodec/x86/vp9dsp_init.h
+++ b/libavcodec/x86/vp9dsp_init.h
@@ -41,6 +41,18 @@ decl_mc_func(avg, sz, h, opt, type, fsz, bpp); \
 decl_mc_func(put, sz, v, opt, type, fsz, bpp); \
 decl_mc_func(avg, sz, v, opt, type, fsz, bpp)
 
+#define decl_ipred_fn(type, sz, bpp, opt) \
+void ff_vp9_ipred_##type##_##sz##x##sz##_##bpp##_##opt(uint8_t *dst, \
+                                                       ptrdiff_t stride, \
+                                                       const uint8_t *l, \
+                                                       const uint8_t *a)
+
+#define decl_ipred_fns(type, bpp, opt4, opt8_16_32) \
+decl_ipred_fn(type,  4, bpp, opt4); \
+decl_ipred_fn(type,  8, bpp, opt8_16_32); \
+decl_ipred_fn(type, 16, bpp, opt8_16_32); \
+decl_ipred_fn(type, 32, bpp, opt8_16_32)
+
 #define mc_rep_func(avg, sz, hsz, hszb, dir, opt, type, f_sz, bpp) \
 static av_always_inline void \
 ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
@@ -142,6 +154,17 @@ filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt)
     init_subpel3_8to64(idx, type, bpp, opt); \
     init_subpel2(4, idx,  4, type, bpp, opt)
 
+#define cat(a, bpp, b) a##bpp##b
+
+#define init_ipred_func(type, enum, sz, bpp, opt) \
+    dsp->intra_pred[TX_##sz##X##sz][enum##_PRED] = \
+        cat(ff_vp9_ipred_##type##_##sz##x##sz##_, bpp, _##opt)
+
+#define init_8_16_32_ipred_funcs(type, enum, bpp, opt) \
+    init_ipred_func(type, enum,  8, bpp, opt); \
+    init_ipred_func(type, enum, 16, bpp, opt); \
+    init_ipred_func(type, enum, 32, bpp, opt)
+
 void ff_vp9dsp_init_10bpp_x86(VP9DSPContext *dsp);
 void ff_vp9dsp_init_12bpp_x86(VP9DSPContext *dsp);
 void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp);
diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c
index bd61e24..f4a4a5d 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -46,6 +46,11 @@ decl_fpel_func(avg,  32, _16, avx2);
 decl_fpel_func(avg,  64, _16, avx2);
 decl_fpel_func(avg, 128, _16, avx2);
 
+decl_ipred_fns(v,       16, mmx,    sse);
+decl_ipred_fns(h,       16, mmxext, sse2);
+decl_ipred_fns(dc,      16, mmxext, sse2);
+decl_ipred_fns(dc_top,  16, mmxext, sse2);
+decl_ipred_fns(dc_left, 16, mmxext, sse2);
 #endif /* HAVE_YASM */
 
 av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
@@ -55,10 +60,15 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
 
     if (EXTERNAL_MMX(cpu_flags)) {
         init_fpel_func(4, 0,   8, put, , mmx);
+        init_ipred_func(v, VERT, 4, 16, mmx);
     }
 
     if (EXTERNAL_MMXEXT(cpu_flags)) {
         init_fpel_func(4, 1,   8, avg, _16, mmxext);
+        init_ipred_func(h, HOR, 4, 16, mmxext);
+        init_ipred_func(dc, DC, 4, 16, mmxext);
+        init_ipred_func(dc_top,  TOP_DC,  4, 16, mmxext);
+        init_ipred_func(dc_left, LEFT_DC, 4, 16, mmxext);
     }
 
     if (EXTERNAL_SSE(cpu_flags)) {
@@ -66,6 +76,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
         init_fpel_func(2, 0,  32, put, , sse);
         init_fpel_func(1, 0,  64, put, , sse);
         init_fpel_func(0, 0, 128, put, , sse);
+        init_8_16_32_ipred_funcs(v, VERT, 16, sse);
     }
 
     if (EXTERNAL_SSE2(cpu_flags)) {
@@ -73,6 +84,10 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
         init_fpel_func(2, 1,  32, avg, _16, sse2);
         init_fpel_func(1, 1,  64, avg, _16, sse2);
         init_fpel_func(0, 1, 128, avg, _16, sse2);
+        init_8_16_32_ipred_funcs(h, HOR, 16, sse2);
+        init_8_16_32_ipred_funcs(dc, DC, 16, sse2);
+        init_8_16_32_ipred_funcs(dc_top,  TOP_DC,  16, sse2);
+        init_8_16_32_ipred_funcs(dc_left, LEFT_DC, 16, sse2);
     }
 
     if (EXTERNAL_AVX_FAST(cpu_flags)) {
diff --git a/libavcodec/x86/vp9dsp_init_16bpp_template.c b/libavcodec/x86/vp9dsp_init_16bpp_template.c
index 56cd79e..f486caf 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp_template.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp_template.c
@@ -121,6 +121,8 @@ lpf_mix2_wrappers(8, 8, bpp, opt); \
 lpf_mix2_wrappers_set(BPC, sse2);
 lpf_mix2_wrappers_set(BPC, ssse3);
 lpf_mix2_wrappers_set(BPC, avx);
+
+decl_ipred_fns(tm, BPC, mmxext, sse2);
 #endif /* HAVE_YASM */
 
 av_cold void INIT_FUNC(VP9DSPContext *dsp)
@@ -153,10 +155,15 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp)
     init_lpf_mix2_func(1, 0, 1, v, 8, 4, bpp, opt); \
     init_lpf_mix2_func(1, 1, 1, v, 8, 8, bpp, opt)
 
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        init_ipred_func(tm, TM_VP8, 4, BPC, mmxext);
+    }
+
     if (EXTERNAL_SSE2(cpu_flags)) {
         init_subpel3(0, put, BPC, sse2);
         init_subpel3(1, avg, BPC, sse2);
         init_lpf_funcs(BPC, sse2);
+        init_8_16_32_ipred_funcs(tm, TM_VP8, BPC, sse2);
     }
 
     if (EXTERNAL_SSSE3(cpu_flags)) {
diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm
new file mode 100644
index 0000000..d9df2c8
--- /dev/null
+++ b/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -0,0 +1,672 @@
+;******************************************************************************
+;* VP9 Intra prediction SIMD optimizations
+;*
+;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pd_2: times 8 dd 2
+pd_4: times 8 dd 4
+pd_8: times 8 dd 8
+pd_16: times 8 dd 16
+pd_32: times 8 dd 32
+
+cextern pw_1
+cextern pw_1023
+cextern pw_4095
+
+SECTION .text
+
+INIT_MMX mmx
+cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a
+    movifnidn               aq, amp
+    mova                    m0, [aq]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_8x8_16, 2, 4, 1, dst, stride, l, a
+    movifnidn               aq, amp
+    mova                    m0, [aq]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_16x16_16, 2, 4, 2, dst, stride, l, a
+    movifnidn               aq, amp
+    mova                    m0, [aq]
+    mova                    m1, [aq+mmsize]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 4
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m1
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m1
+    mova   [dstq+strideq*2+ 0], m0
+    mova   [dstq+strideq*2+16], m1
+    mova   [dstq+stride3q + 0], m0
+    mova   [dstq+stride3q +16], m1
+    lea                   dstq, [dstq+strideq*4]
+    dec               cntd
+    jg .loop
+    RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_32x32_16, 2, 4, 4, dst, stride, l, a
+    movifnidn               aq, amp
+    mova                    m0, [aq+mmsize*0]
+    mova                    m1, [aq+mmsize*1]
+    mova                    m2, [aq+mmsize*2]
+    mova                    m3, [aq+mmsize*3]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 8
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m1
+    mova   [dstq+strideq*0+32], m2
+    mova   [dstq+strideq*0+48], m3
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m1
+    mova   [dstq+strideq*1+32], m2
+    mova   [dstq+strideq*1+48], m3
+    mova   [dstq+strideq*2+ 0], m0
+    mova   [dstq+strideq*2+16], m1
+    mova   [dstq+strideq*2+32], m2
+    mova   [dstq+strideq*2+48], m3
+    mova   [dstq+stride3q + 0], m0
+    mova   [dstq+stride3q +16], m1
+    mova   [dstq+stride3q +32], m2
+    mova   [dstq+stride3q +48], m3
+    lea                   dstq, [dstq+strideq*4]
+    dec               cntd
+    jg .loop
+    RET
+
+INIT_MMX mmxext
+cglobal vp9_ipred_h_4x4_16, 3, 3, 4, dst, stride, l, a
+    mova                    m3, [lq]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pshufw                  m0, m3, q3333
+    pshufw                  m1, m3, q2222
+    pshufw                  m2, m3, q1111
+    pshufw                  m3, m3, q0000
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m1
+    mova      [dstq+strideq*2], m2
+    mova      [dstq+stride3q ], m3
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_h_8x8_16, 3, 4, 5, dst, stride, l, a
+    mova                    m4, [lq]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    mov                   cntd, 2
+    lea               stride3q, [strideq*3]
+.loop:
+    pshufhw                 m0, m4, q3333
+    pshufhw                 m1, m4, q2222
+    pshufhw                 m2, m4, q1111
+    pshufhw                 m3, m4, q0000
+    punpckhqdq              m0, m0
+    punpckhqdq              m1, m1
+    punpckhqdq              m2, m2
+    punpckhqdq              m3, m3
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m1
+    mova      [dstq+strideq*2], m2
+    mova      [dstq+stride3q ], m3
+    lea                   dstq, [dstq+strideq*4]
+    movlhps                 m4, m4
+    dec                   cntd
+    jg .loop
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_h_16x16_16, 3, 4, 6, dst, stride, l, a
+    mova                    m4, [lq]
+    mova                    m5, [lq+mmsize]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    mov                   cntd, 4
+    lea               stride3q, [strideq*3]
+.loop:
+    pshufhw                 m0, m5, q3333
+    pshuflw                 m1, m5, q3333
+    pshufhw                 m2, m4, q3333
+    pshuflw                 m3, m4, q3333
+    punpckhqdq              m0, m0
+    punpcklqdq              m1, m1
+    punpckhqdq              m2, m2
+    punpcklqdq              m3, m3
+    mova  [dstq+strideq *0+ 0], m0
+    mova  [dstq+strideq *0+16], m0
+    mova  [dstq+strideq *4+ 0], m1
+    mova  [dstq+strideq *4+16], m1
+    mova  [dstq+strideq *8+ 0], m2
+    mova  [dstq+strideq *8+16], m2
+    mova  [dstq+stride3q*4+ 0], m3
+    mova  [dstq+stride3q*4+16], m3
+    add                   dstq, strideq
+    pslldq                  m4, 2
+    pslldq                  m5, 2
+    dec                   cntd
+    jg .loop
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_h_32x32_16, 3, 5, 8, dst, stride, l, a
+    mova                    m4, [lq+mmsize*0]
+    mova                    m5, [lq+mmsize*1]
+    mova                    m6, [lq+mmsize*2]
+    mova                    m7, [lq+mmsize*3]
+    DEFINE_ARGS dst, stride, stride24, stride8, cnt
+    mov                   cntd, 8
+    lea               stride8q, [strideq*8]
+    lea              stride24q, [stride8q*3]
+.loop:
+    pshufhw                 m0, m7, q3333
+    pshufhw                 m1, m6, q3333
+    pshufhw                 m2, m5, q3333
+    pshufhw                 m3, m4, q3333
+    punpckhqdq              m0, m0
+    punpckhqdq              m1, m1
+    punpckhqdq              m2, m2
+    punpckhqdq              m3, m3
+    mova  [dstq+stride8q*0+ 0], m0
+    mova  [dstq+stride8q*0+16], m0
+    mova  [dstq+stride8q*0+32], m0
+    mova  [dstq+stride8q*0+48], m0
+    mova  [dstq+stride8q*1+ 0], m1
+    mova  [dstq+stride8q*1+16], m1
+    mova  [dstq+stride8q*1+32], m1
+    mova  [dstq+stride8q*1+48], m1
+    mova  [dstq+stride8q*2+ 0], m2
+    mova  [dstq+stride8q*2+16], m2
+    mova  [dstq+stride8q*2+32], m2
+    mova  [dstq+stride8q*2+48], m2
+    mova  [dstq+stride24q + 0], m3
+    mova  [dstq+stride24q +16], m3
+    mova  [dstq+stride24q +32], m3
+    mova  [dstq+stride24q +48], m3
+    add                   dstq, strideq
+    pslldq                  m4, 2
+    pslldq                  m5, 2
+    pslldq                  m6, 2
+    pslldq                  m7, 2
+    dec                   cntd
+    jg .loop
+    RET
+
+INIT_MMX mmxext
+cglobal vp9_ipred_dc_4x4_16, 4, 4, 2, dst, stride, l, a
+    mova                    m0, [lq]
+    paddw                   m0, [aq]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pmaddwd                 m0, [pw_1]
+    pshufw                  m1, m0, q3232
+    paddd                   m0, m1
+    paddd                   m0, [pd_4]
+    psrad                   m0, 3
+    pshufw                  m0, m0, q0000
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_8x8_16, 4, 4, 2, dst, stride, l, a
+    mova                    m0, [lq]
+    paddw                   m0, [aq]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pmaddwd                 m0, [pw_1]
+    pshufd                  m1, m0, q3232
+    paddd                   m0, m1
+    pshufd                  m1, m0, q1111
+    paddd                   m0, m1
+    paddd                   m0, [pd_8]
+    psrad                   m0, 4
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_16x16_16, 4, 4, 2, dst, stride, l, a
+    mova                    m0, [lq]
+    paddw                   m0, [lq+mmsize]
+    paddw                   m0, [aq]
+    paddw                   m0, [aq+mmsize]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 4
+    pmaddwd                 m0, [pw_1]
+    pshufd                  m1, m0, q3232
+    paddd                   m0, m1
+    pshufd                  m1, m0, q1111
+    paddd                   m0, m1
+    paddd                   m0, [pd_16]
+    psrad                   m0, 5
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m0
+    mova   [dstq+strideq*2+ 0], m0
+    mova   [dstq+strideq*2+16], m0
+    mova   [dstq+stride3q + 0], m0
+    mova   [dstq+stride3q +16], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_32x32_16, 4, 4, 2, dst, stride, l, a
+    mova                    m0, [lq+mmsize*0]
+    paddw                   m0, [lq+mmsize*1]
+    paddw                   m0, [lq+mmsize*2]
+    paddw                   m0, [lq+mmsize*3]
+    paddw                   m0, [aq+mmsize*0]
+    paddw                   m0, [aq+mmsize*1]
+    paddw                   m0, [aq+mmsize*2]
+    paddw                   m0, [aq+mmsize*3]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 8
+    pmaddwd                 m0, [pw_1]
+    pshufd                  m1, m0, q3232
+    paddd                   m0, m1
+    pshufd                  m1, m0, q1111
+    paddd                   m0, m1
+    paddd                   m0, [pd_32]
+    psrad                   m0, 6
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*0+32], m0
+    mova   [dstq+strideq*0+48], m0
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m0
+    mova   [dstq+strideq*1+32], m0
+    mova   [dstq+strideq*1+48], m0
+    mova   [dstq+strideq*2+ 0], m0
+    mova   [dstq+strideq*2+16], m0
+    mova   [dstq+strideq*2+32], m0
+    mova   [dstq+strideq*2+48], m0
+    mova   [dstq+stride3q + 0], m0
+    mova   [dstq+stride3q +16], m0
+    mova   [dstq+stride3q +32], m0
+    mova   [dstq+stride3q +48], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+%macro DC_1D_FNS 2
+INIT_MMX mmxext
+cglobal vp9_ipred_dc_%1_4x4_16, 4, 4, 2, dst, stride, l, a
+    mova                    m0, [%2]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pmaddwd                 m0, [pw_1]
+    pshufw                  m1, m0, q3232
+    paddd                   m0, m1
+    paddd                   m0, [pd_2]
+    psrad                   m0, 2
+    pshufw                  m0, m0, q0000
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_%1_8x8_16, 4, 4, 2, dst, stride, l, a
+    mova                    m0, [%2]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pmaddwd                 m0, [pw_1]
+    pshufd                  m1, m0, q3232
+    paddd                   m0, m1
+    pshufd                  m1, m0, q1111
+    paddd                   m0, m1
+    paddd                   m0, [pd_4]
+    psrad                   m0, 3
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_%1_16x16_16, 4, 4, 2, dst, stride, l, a
+    mova                    m0, [%2]
+    paddw                   m0, [%2+mmsize]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 4
+    pmaddwd                 m0, [pw_1]
+    pshufd                  m1, m0, q3232
+    paddd                   m0, m1
+    pshufd                  m1, m0, q1111
+    paddd                   m0, m1
+    paddd                   m0, [pd_8]
+    psrad                   m0, 4
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m0
+    mova   [dstq+strideq*2+ 0], m0
+    mova   [dstq+strideq*2+16], m0
+    mova   [dstq+stride3q + 0], m0
+    mova   [dstq+stride3q +16], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_%1_32x32_16, 4, 4, 2, dst, stride, l, a
+    mova                    m0, [%2+mmsize*0]
+    paddw                   m0, [%2+mmsize*1]
+    paddw                   m0, [%2+mmsize*2]
+    paddw                   m0, [%2+mmsize*3]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 8
+    pmaddwd                 m0, [pw_1]
+    pshufd                  m1, m0, q3232
+    paddd                   m0, m1
+    pshufd                  m1, m0, q1111
+    paddd                   m0, m1
+    paddd                   m0, [pd_16]
+    psrad                   m0, 5
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*0+32], m0
+    mova   [dstq+strideq*0+48], m0
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m0
+    mova   [dstq+strideq*1+32], m0
+    mova   [dstq+strideq*1+48], m0
+    mova   [dstq+strideq*2+ 0], m0
+    mova   [dstq+strideq*2+16], m0
+    mova   [dstq+strideq*2+32], m0
+    mova   [dstq+strideq*2+48], m0
+    mova   [dstq+stride3q + 0], m0
+    mova   [dstq+stride3q +16], m0
+    mova   [dstq+stride3q +32], m0
+    mova   [dstq+stride3q +48], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+DC_1D_FNS top,  aq
+DC_1D_FNS left, lq
+
+INIT_MMX mmxext
+cglobal vp9_ipred_tm_4x4_10, 4, 4, 6, dst, stride, l, a
+    mova                    m5, [pw_1023]
+.body:
+    mova                    m4, [aq]
+    mova                    m3, [lq]
+    movd                    m0, [aq-2]
+    pshufw                  m0, m0, q0000
+    psubw                   m4, m0
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pshufw                  m0, m3, q3333
+    pshufw                  m1, m3, q2222
+    pshufw                  m2, m3, q1111
+    pshufw                  m3, m3, q0000
+    paddw                   m0, m4
+    paddw                   m1, m4
+    paddw                   m2, m4
+    paddw                   m3, m4
+    pxor                    m4, m4
+    pmaxsw                  m0, m4
+    pmaxsw                  m1, m4
+    pmaxsw                  m2, m4
+    pmaxsw                  m3, m4
+    pminsw                  m0, m5
+    pminsw                  m1, m5
+    pminsw                  m2, m5
+    pminsw                  m3, m5
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m1
+    mova      [dstq+strideq*2], m2
+    mova      [dstq+stride3q ], m3
+    RET
+
+cglobal vp9_ipred_tm_4x4_12, 4, 4, 6, dst, stride, l, a
+    mova                    m5, [pw_4095]
+    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_4x4_10 %+ SUFFIX).body
+
+INIT_XMM sse2
+cglobal vp9_ipred_tm_8x8_10, 4, 4, 8, dst, stride, l, a
+    mova                    m7, [pw_1023]
+.body:
+    pxor                    m6, m6
+    mova                    m5, [aq]
+    mova                    m4, [lq]
+    movd                    m0, [aq-2]
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+    psubw                   m5, m0
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 2
+.loop:
+    pshufhw                 m0, m4, q3333
+    pshufhw                 m1, m4, q2222
+    pshufhw                 m2, m4, q1111
+    pshufhw                 m3, m4, q0000
+    punpckhqdq              m0, m0
+    punpckhqdq              m1, m1
+    punpckhqdq              m2, m2
+    punpckhqdq              m3, m3
+    paddw                   m0, m5
+    paddw                   m1, m5
+    paddw                   m2, m5
+    paddw                   m3, m5
+    pmaxsw                  m0, m6
+    pmaxsw                  m1, m6
+    pmaxsw                  m2, m6
+    pmaxsw                  m3, m6
+    pminsw                  m0, m7
+    pminsw                  m1, m7
+    pminsw                  m2, m7
+    pminsw                  m3, m7
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m1
+    mova      [dstq+strideq*2], m2
+    mova      [dstq+stride3q ], m3
+    lea                   dstq, [dstq+strideq*4]
+    movlhps                 m4, m4
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_tm_8x8_12, 4, 4, 8, dst, stride, l, a
+    mova                    m7, [pw_4095]
+    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_8x8_10 %+ SUFFIX).body
+
+INIT_XMM sse2
+cglobal vp9_ipred_tm_16x16_10, 4, 4, 8, dst, stride, l, a
+    mova                    m7, [pw_1023]
+.body:
+    pxor                    m6, m6
+    mova                    m4, [aq]
+    mova                    m5, [aq+mmsize]
+    mova                    m2, [lq]
+    mova                    m3, [lq+mmsize]
+    movd                    m0, [aq-2]
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+    psubw                   m4, m0
+    psubw                   m5, m0
+    DEFINE_ARGS dst, stride, cnt
+    mov                   cntd, 8
+.loop:
+    pshufhw                 m0, m3, q3333
+    pslldq                  m3, 2
+    punpckhqdq              m0, m0
+    paddw                   m1, m0, m5
+    paddw                   m0, m4
+    pmaxsw                  m1, m6
+    pmaxsw                  m0, m6
+    pminsw                  m1, m7
+    pminsw                  m0, m7
+    mova   [dstq+strideq*0+16], m1
+    mova   [dstq+strideq*0+ 0], m0
+    pshufhw                 m0, m2, q3333
+    pslldq                  m2, 2
+    punpckhqdq              m0, m0
+    paddw                   m1, m0, m5
+    paddw                   m0, m4
+    pmaxsw                  m1, m6
+    pmaxsw                  m0, m6
+    pminsw                  m1, m7
+    pminsw                  m0, m7
+    mova   [dstq+strideq*8+16], m1
+    mova   [dstq+strideq*8+ 0], m0
+    add                   dstq, strideq
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_tm_16x16_12, 4, 4, 8, dst, stride, l, a
+    mova                    m7, [pw_4095]
+    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_16x16_10 %+ SUFFIX).body
+
+INIT_XMM sse2
+cglobal vp9_ipred_tm_32x32_10, 4, 5, 10, 32 * ARCH_X86_32, dst, stride, l, a
+    mova                    m0, [pw_1023]
+.body:
+    pxor                    m1, m1
+%if ARCH_X86_64
+    SWAP                     0, 8
+    SWAP                     1, 9
+%define reg_min m9
+%define reg_max m8
+%else
+    mova              [rsp+ 0], m0
+    mova              [rsp+16], m1
+%define reg_min [rsp+16]
+%define reg_max [rsp+ 0]
+%endif
+
+    mova                    m4, [aq+mmsize*0]
+    mova                    m5, [aq+mmsize*1]
+    mova                    m6, [aq+mmsize*2]
+    mova                    m7, [aq+mmsize*3]
+    movd                    m0, [aq-2]
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+    psubw                   m4, m0
+    psubw                   m5, m0
+    psubw                   m6, m0
+    psubw                   m7, m0
+    DEFINE_ARGS dst, stride, l, cnt, cnto
+    mov                  cntod, 4
+.loopo:
+    mova                    m3, [lq+mmsize*3]
+    sub                     lq, mmsize
+    mov                   cntd, 8
+.loop:
+    pshufhw                 m0, m3, q3333
+    pslldq                  m3, 2
+    punpckhqdq              m0, m0
+    paddw                   m1, m0, m4
+    paddw                   m2, m0, m5
+    pmaxsw                  m1, reg_min
+    pmaxsw                  m2, reg_min
+    pminsw                  m1, reg_max
+    pminsw                  m2, reg_max
+    mova   [dstq+strideq*0+ 0], m1
+    mova   [dstq+strideq*0+16], m2
+    paddw                   m1, m0, m6
+    paddw                   m0, m7
+    pmaxsw                  m1, reg_min
+    pmaxsw                  m0, reg_min
+    pminsw                  m1, reg_max
+    pminsw                  m0, reg_max
+    mova   [dstq+strideq*0+32], m1
+    mova   [dstq+strideq*0+48], m0
+    add                   dstq, strideq
+    dec                   cntd
+    jg .loop
+    dec                  cntod
+    jg .loopo
+    RET
+
+cglobal vp9_ipred_tm_32x32_12, 4, 5, 10, 32 * ARCH_X86_32, dst, stride, l, a
+    mova                    m0, [pw_4095]
+    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body
diff --git a/libavcodec/x86/vp9lpf_16bpp.asm b/libavcodec/x86/vp9lpf_16bpp.asm
index 51294f0..5d6dc62 100644
--- a/libavcodec/x86/vp9lpf_16bpp.asm
+++ b/libavcodec/x86/vp9lpf_16bpp.asm
@@ -26,7 +26,6 @@ SECTION_RODATA
 
 pw_511: times 16 dw 511
 pw_2047: times 16 dw 2047
-pw_4095: times 16 dw 4095
 pw_16384: times 16 dw 16384
 pw_m512: times 16 dw -512
 pw_m2048: times 16 dw -2048
@@ -38,6 +37,7 @@ cextern pw_8
 cextern pw_16
 cextern pw_256
 cextern pw_1023
+cextern pw_4095
 cextern pw_m1
 
 SECTION .text
diff --git a/libavcodec/x86/vp9mc_16bpp.asm b/libavcodec/x86/vp9mc_16bpp.asm
index f60dab7..9a462ea 100644
--- a/libavcodec/x86/vp9mc_16bpp.asm
+++ b/libavcodec/x86/vp9mc_16bpp.asm
@@ -24,10 +24,10 @@
 
 SECTION_RODATA 32
 
-pw_4095: times 16 dw 0xfff
 pd_64: times 8 dd 64
 
 cextern pw_1023
+cextern pw_4095
 
 SECTION .text
 
-- 
2.1.2



More information about the ffmpeg-devel mailing list