[FFmpeg-devel] [PATCH] vp9: 16bpp tm/dc/h/v intra pred simd (mostly sse2) functions.
Ronald S. Bultje
rsbultje at gmail.com
Fri Sep 25 22:36:48 CEST 2015
---
libavcodec/x86/Makefile | 1 +
libavcodec/x86/constants.c | 2 +
libavcodec/x86/constants.h | 1 +
libavcodec/x86/vp9dsp_init.h | 23 +
libavcodec/x86/vp9dsp_init_16bpp.c | 15 +
libavcodec/x86/vp9dsp_init_16bpp_template.c | 7 +
libavcodec/x86/vp9intrapred_16bpp.asm | 672 ++++++++++++++++++++++++++++
libavcodec/x86/vp9lpf_16bpp.asm | 2 +-
libavcodec/x86/vp9mc_16bpp.asm | 2 +-
9 files changed, 723 insertions(+), 2 deletions(-)
create mode 100644 libavcodec/x86/vp9intrapred_16bpp.asm
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 9cddaac..44bf342 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -157,6 +157,7 @@ YASM-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp.o
YASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o
YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o
YASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9intrapred.o \
+ x86/vp9intrapred_16bpp.o \
x86/vp9itxfm.o \
x86/vp9lpf.o \
x86/vp9lpf_16bpp.o \
diff --git a/libavcodec/x86/constants.c b/libavcodec/x86/constants.c
index 553dd49..9f3c8b4 100644
--- a/libavcodec/x86/constants.c
+++ b/libavcodec/x86/constants.c
@@ -55,6 +55,8 @@ DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1024) = { 0x0400040004000400ULL, 0x040
0x0400040004000400ULL, 0x0400040004000400ULL};
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_2048) = { 0x0800080008000800ULL, 0x0800080008000800ULL,
0x0800080008000800ULL, 0x0800080008000800ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4095) = { 0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL,
+ 0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4096) = { 0x1000100010001000ULL, 0x1000100010001000ULL,
0x1000100010001000ULL, 0x1000100010001000ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_8192) = { 0x2000200020002000ULL, 0x2000200020002000ULL,
diff --git a/libavcodec/x86/constants.h b/libavcodec/x86/constants.h
index 33dbb65..37a1869 100644
--- a/libavcodec/x86/constants.h
+++ b/libavcodec/x86/constants.h
@@ -47,6 +47,7 @@ extern const ymm_reg ff_pw_512;
extern const ymm_reg ff_pw_1023;
extern const ymm_reg ff_pw_1024;
extern const ymm_reg ff_pw_2048;
+extern const ymm_reg ff_pw_4095;
extern const ymm_reg ff_pw_4096;
extern const ymm_reg ff_pw_8192;
extern const ymm_reg ff_pw_m1;
diff --git a/libavcodec/x86/vp9dsp_init.h b/libavcodec/x86/vp9dsp_init.h
index d1a9514..47d2246 100644
--- a/libavcodec/x86/vp9dsp_init.h
+++ b/libavcodec/x86/vp9dsp_init.h
@@ -41,6 +41,18 @@ decl_mc_func(avg, sz, h, opt, type, fsz, bpp); \
decl_mc_func(put, sz, v, opt, type, fsz, bpp); \
decl_mc_func(avg, sz, v, opt, type, fsz, bpp)
+#define decl_ipred_fn(type, sz, bpp, opt) \
+void ff_vp9_ipred_##type##_##sz##x##sz##_##bpp##_##opt(uint8_t *dst, \
+ ptrdiff_t stride, \
+ const uint8_t *l, \
+ const uint8_t *a)
+
+#define decl_ipred_fns(type, bpp, opt4, opt8_16_32) \
+decl_ipred_fn(type, 4, bpp, opt4); \
+decl_ipred_fn(type, 8, bpp, opt8_16_32); \
+decl_ipred_fn(type, 16, bpp, opt8_16_32); \
+decl_ipred_fn(type, 32, bpp, opt8_16_32)
+
#define mc_rep_func(avg, sz, hsz, hszb, dir, opt, type, f_sz, bpp) \
static av_always_inline void \
ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
@@ -142,6 +154,17 @@ filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt)
init_subpel3_8to64(idx, type, bpp, opt); \
init_subpel2(4, idx, 4, type, bpp, opt)
+#define cat(a, bpp, b) a##bpp##b
+
+#define init_ipred_func(type, enum, sz, bpp, opt) \
+ dsp->intra_pred[TX_##sz##X##sz][enum##_PRED] = \
+ cat(ff_vp9_ipred_##type##_##sz##x##sz##_, bpp, _##opt)
+
+#define init_8_16_32_ipred_funcs(type, enum, bpp, opt) \
+ init_ipred_func(type, enum, 8, bpp, opt); \
+ init_ipred_func(type, enum, 16, bpp, opt); \
+ init_ipred_func(type, enum, 32, bpp, opt)
+
void ff_vp9dsp_init_10bpp_x86(VP9DSPContext *dsp);
void ff_vp9dsp_init_12bpp_x86(VP9DSPContext *dsp);
void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp);
diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c
index bd61e24..f4a4a5d 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -46,6 +46,11 @@ decl_fpel_func(avg, 32, _16, avx2);
decl_fpel_func(avg, 64, _16, avx2);
decl_fpel_func(avg, 128, _16, avx2);
+decl_ipred_fns(v, 16, mmx, sse);
+decl_ipred_fns(h, 16, mmxext, sse2);
+decl_ipred_fns(dc, 16, mmxext, sse2);
+decl_ipred_fns(dc_top, 16, mmxext, sse2);
+decl_ipred_fns(dc_left, 16, mmxext, sse2);
#endif /* HAVE_YASM */
av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
@@ -55,10 +60,15 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
if (EXTERNAL_MMX(cpu_flags)) {
init_fpel_func(4, 0, 8, put, , mmx);
+ init_ipred_func(v, VERT, 4, 16, mmx);
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
init_fpel_func(4, 1, 8, avg, _16, mmxext);
+ init_ipred_func(h, HOR, 4, 16, mmxext);
+ init_ipred_func(dc, DC, 4, 16, mmxext);
+ init_ipred_func(dc_top, TOP_DC, 4, 16, mmxext);
+ init_ipred_func(dc_left, LEFT_DC, 4, 16, mmxext);
}
if (EXTERNAL_SSE(cpu_flags)) {
@@ -66,6 +76,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
init_fpel_func(2, 0, 32, put, , sse);
init_fpel_func(1, 0, 64, put, , sse);
init_fpel_func(0, 0, 128, put, , sse);
+ init_8_16_32_ipred_funcs(v, VERT, 16, sse);
}
if (EXTERNAL_SSE2(cpu_flags)) {
@@ -73,6 +84,10 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
init_fpel_func(2, 1, 32, avg, _16, sse2);
init_fpel_func(1, 1, 64, avg, _16, sse2);
init_fpel_func(0, 1, 128, avg, _16, sse2);
+ init_8_16_32_ipred_funcs(h, HOR, 16, sse2);
+ init_8_16_32_ipred_funcs(dc, DC, 16, sse2);
+ init_8_16_32_ipred_funcs(dc_top, TOP_DC, 16, sse2);
+ init_8_16_32_ipred_funcs(dc_left, LEFT_DC, 16, sse2);
}
if (EXTERNAL_AVX_FAST(cpu_flags)) {
diff --git a/libavcodec/x86/vp9dsp_init_16bpp_template.c b/libavcodec/x86/vp9dsp_init_16bpp_template.c
index 56cd79e..f486caf 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp_template.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp_template.c
@@ -121,6 +121,8 @@ lpf_mix2_wrappers(8, 8, bpp, opt); \
lpf_mix2_wrappers_set(BPC, sse2);
lpf_mix2_wrappers_set(BPC, ssse3);
lpf_mix2_wrappers_set(BPC, avx);
+
+decl_ipred_fns(tm, BPC, mmxext, sse2);
#endif /* HAVE_YASM */
av_cold void INIT_FUNC(VP9DSPContext *dsp)
@@ -153,10 +155,15 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp)
init_lpf_mix2_func(1, 0, 1, v, 8, 4, bpp, opt); \
init_lpf_mix2_func(1, 1, 1, v, 8, 8, bpp, opt)
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ init_ipred_func(tm, TM_VP8, 4, BPC, mmxext);
+ }
+
if (EXTERNAL_SSE2(cpu_flags)) {
init_subpel3(0, put, BPC, sse2);
init_subpel3(1, avg, BPC, sse2);
init_lpf_funcs(BPC, sse2);
+ init_8_16_32_ipred_funcs(tm, TM_VP8, BPC, sse2);
}
if (EXTERNAL_SSSE3(cpu_flags)) {
diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm
new file mode 100644
index 0000000..d9df2c8
--- /dev/null
+++ b/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -0,0 +1,672 @@
+;******************************************************************************
+;* VP9 Intra prediction SIMD optimizations
+;*
+;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pd_2: times 8 dd 2
+pd_4: times 8 dd 4
+pd_8: times 8 dd 8
+pd_16: times 8 dd 16
+pd_32: times 8 dd 32
+
+cextern pw_1
+cextern pw_1023
+cextern pw_4095
+
+SECTION .text
+
+INIT_MMX mmx
+cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_8x8_16, 2, 4, 1, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_16x16_16, 2, 4, 2, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq]
+ mova m1, [aq+mmsize]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m1
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m1
+ mova [dstq+strideq*2+ 0], m0
+ mova [dstq+strideq*2+16], m1
+ mova [dstq+stride3q + 0], m0
+ mova [dstq+stride3q +16], m1
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_32x32_16, 2, 4, 4, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq+mmsize*0]
+ mova m1, [aq+mmsize*1]
+ mova m2, [aq+mmsize*2]
+ mova m3, [aq+mmsize*3]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 8
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m1
+ mova [dstq+strideq*0+32], m2
+ mova [dstq+strideq*0+48], m3
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m1
+ mova [dstq+strideq*1+32], m2
+ mova [dstq+strideq*1+48], m3
+ mova [dstq+strideq*2+ 0], m0
+ mova [dstq+strideq*2+16], m1
+ mova [dstq+strideq*2+32], m2
+ mova [dstq+strideq*2+48], m3
+ mova [dstq+stride3q + 0], m0
+ mova [dstq+stride3q +16], m1
+ mova [dstq+stride3q +32], m2
+ mova [dstq+stride3q +48], m3
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+INIT_MMX mmxext
+cglobal vp9_ipred_h_4x4_16, 3, 3, 4, dst, stride, l, a
+ mova m3, [lq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pshufw m0, m3, q3333
+ pshufw m1, m3, q2222
+ pshufw m2, m3, q1111
+ pshufw m3, m3, q0000
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_h_8x8_16, 3, 4, 5, dst, stride, l, a
+ mova m4, [lq]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ mov cntd, 2
+ lea stride3q, [strideq*3]
+.loop:
+ pshufhw m0, m4, q3333
+ pshufhw m1, m4, q2222
+ pshufhw m2, m4, q1111
+ pshufhw m3, m4, q0000
+ punpckhqdq m0, m0
+ punpckhqdq m1, m1
+ punpckhqdq m2, m2
+ punpckhqdq m3, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ movlhps m4, m4
+ dec cntd
+ jg .loop
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_h_16x16_16, 3, 4, 6, dst, stride, l, a
+ mova m4, [lq]
+ mova m5, [lq+mmsize]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ mov cntd, 4
+ lea stride3q, [strideq*3]
+.loop:
+ pshufhw m0, m5, q3333
+ pshuflw m1, m5, q3333
+ pshufhw m2, m4, q3333
+ pshuflw m3, m4, q3333
+ punpckhqdq m0, m0
+ punpcklqdq m1, m1
+ punpckhqdq m2, m2
+ punpcklqdq m3, m3
+ mova [dstq+strideq *0+ 0], m0
+ mova [dstq+strideq *0+16], m0
+ mova [dstq+strideq *4+ 0], m1
+ mova [dstq+strideq *4+16], m1
+ mova [dstq+strideq *8+ 0], m2
+ mova [dstq+strideq *8+16], m2
+ mova [dstq+stride3q*4+ 0], m3
+ mova [dstq+stride3q*4+16], m3
+ add dstq, strideq
+ pslldq m4, 2
+ pslldq m5, 2
+ dec cntd
+ jg .loop
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_h_32x32_16, 3, 5, 8, dst, stride, l, a
+ mova m4, [lq+mmsize*0]
+ mova m5, [lq+mmsize*1]
+ mova m6, [lq+mmsize*2]
+ mova m7, [lq+mmsize*3]
+ DEFINE_ARGS dst, stride, stride24, stride8, cnt
+ mov cntd, 8
+ lea stride8q, [strideq*8]
+ lea stride24q, [stride8q*3]
+.loop:
+ pshufhw m0, m7, q3333
+ pshufhw m1, m6, q3333
+ pshufhw m2, m5, q3333
+ pshufhw m3, m4, q3333
+ punpckhqdq m0, m0
+ punpckhqdq m1, m1
+ punpckhqdq m2, m2
+ punpckhqdq m3, m3
+ mova [dstq+stride8q*0+ 0], m0
+ mova [dstq+stride8q*0+16], m0
+ mova [dstq+stride8q*0+32], m0
+ mova [dstq+stride8q*0+48], m0
+ mova [dstq+stride8q*1+ 0], m1
+ mova [dstq+stride8q*1+16], m1
+ mova [dstq+stride8q*1+32], m1
+ mova [dstq+stride8q*1+48], m1
+ mova [dstq+stride8q*2+ 0], m2
+ mova [dstq+stride8q*2+16], m2
+ mova [dstq+stride8q*2+32], m2
+ mova [dstq+stride8q*2+48], m2
+ mova [dstq+stride24q + 0], m3
+ mova [dstq+stride24q +16], m3
+ mova [dstq+stride24q +32], m3
+ mova [dstq+stride24q +48], m3
+ add dstq, strideq
+ pslldq m4, 2
+ pslldq m5, 2
+ pslldq m6, 2
+ pslldq m7, 2
+ dec cntd
+ jg .loop
+ RET
+
+INIT_MMX mmxext
+cglobal vp9_ipred_dc_4x4_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [lq]
+ paddw m0, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pmaddwd m0, [pw_1]
+ pshufw m1, m0, q3232
+ paddd m0, m1
+ paddd m0, [pd_4]
+ psrad m0, 3
+ pshufw m0, m0, q0000
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_8x8_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [lq]
+ paddw m0, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pmaddwd m0, [pw_1]
+ pshufd m1, m0, q3232
+ paddd m0, m1
+ pshufd m1, m0, q1111
+ paddd m0, m1
+ paddd m0, [pd_8]
+ psrad m0, 4
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_16x16_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [lq]
+ paddw m0, [lq+mmsize]
+ paddw m0, [aq]
+ paddw m0, [aq+mmsize]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+ pmaddwd m0, [pw_1]
+ pshufd m1, m0, q3232
+ paddd m0, m1
+ pshufd m1, m0, q1111
+ paddd m0, m1
+ paddd m0, [pd_16]
+ psrad m0, 5
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m0
+ mova [dstq+strideq*2+ 0], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q + 0], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_32x32_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [lq+mmsize*0]
+ paddw m0, [lq+mmsize*1]
+ paddw m0, [lq+mmsize*2]
+ paddw m0, [lq+mmsize*3]
+ paddw m0, [aq+mmsize*0]
+ paddw m0, [aq+mmsize*1]
+ paddw m0, [aq+mmsize*2]
+ paddw m0, [aq+mmsize*3]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 8
+ pmaddwd m0, [pw_1]
+ pshufd m1, m0, q3232
+ paddd m0, m1
+ pshufd m1, m0, q1111
+ paddd m0, m1
+ paddd m0, [pd_32]
+ psrad m0, 6
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*0+32], m0
+ mova [dstq+strideq*0+48], m0
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m0
+ mova [dstq+strideq*1+32], m0
+ mova [dstq+strideq*1+48], m0
+ mova [dstq+strideq*2+ 0], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+strideq*2+32], m0
+ mova [dstq+strideq*2+48], m0
+ mova [dstq+stride3q + 0], m0
+ mova [dstq+stride3q +16], m0
+ mova [dstq+stride3q +32], m0
+ mova [dstq+stride3q +48], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+%macro DC_1D_FNS 2
+INIT_MMX mmxext
+cglobal vp9_ipred_dc_%1_4x4_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [%2]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pmaddwd m0, [pw_1]
+ pshufw m1, m0, q3232
+ paddd m0, m1
+ paddd m0, [pd_2]
+ psrad m0, 2
+ pshufw m0, m0, q0000
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_%1_8x8_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [%2]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pmaddwd m0, [pw_1]
+ pshufd m1, m0, q3232
+ paddd m0, m1
+ pshufd m1, m0, q1111
+ paddd m0, m1
+ paddd m0, [pd_4]
+ psrad m0, 3
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_%1_16x16_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [%2]
+ paddw m0, [%2+mmsize]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+ pmaddwd m0, [pw_1]
+ pshufd m1, m0, q3232
+ paddd m0, m1
+ pshufd m1, m0, q1111
+ paddd m0, m1
+ paddd m0, [pd_8]
+ psrad m0, 4
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m0
+ mova [dstq+strideq*2+ 0], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q + 0], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_%1_32x32_16, 4, 4, 2, dst, stride, l, a
+ mova m0, [%2+mmsize*0]
+ paddw m0, [%2+mmsize*1]
+ paddw m0, [%2+mmsize*2]
+ paddw m0, [%2+mmsize*3]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 8
+ pmaddwd m0, [pw_1]
+ pshufd m1, m0, q3232
+ paddd m0, m1
+ pshufd m1, m0, q1111
+ paddd m0, m1
+ paddd m0, [pd_16]
+ psrad m0, 5
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*0+32], m0
+ mova [dstq+strideq*0+48], m0
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m0
+ mova [dstq+strideq*1+32], m0
+ mova [dstq+strideq*1+48], m0
+ mova [dstq+strideq*2+ 0], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+strideq*2+32], m0
+ mova [dstq+strideq*2+48], m0
+ mova [dstq+stride3q + 0], m0
+ mova [dstq+stride3q +16], m0
+ mova [dstq+stride3q +32], m0
+ mova [dstq+stride3q +48], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+DC_1D_FNS top, aq
+DC_1D_FNS left, lq
+
+INIT_MMX mmxext
+cglobal vp9_ipred_tm_4x4_10, 4, 4, 6, dst, stride, l, a
+ mova m5, [pw_1023]
+.body:
+ mova m4, [aq]
+ mova m3, [lq]
+ movd m0, [aq-2]
+ pshufw m0, m0, q0000
+ psubw m4, m0
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pshufw m0, m3, q3333
+ pshufw m1, m3, q2222
+ pshufw m2, m3, q1111
+ pshufw m3, m3, q0000
+ paddw m0, m4
+ paddw m1, m4
+ paddw m2, m4
+ paddw m3, m4
+ pxor m4, m4
+ pmaxsw m0, m4
+ pmaxsw m1, m4
+ pmaxsw m2, m4
+ pmaxsw m3, m4
+ pminsw m0, m5
+ pminsw m1, m5
+ pminsw m2, m5
+ pminsw m3, m5
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ RET
+
+cglobal vp9_ipred_tm_4x4_12, 4, 4, 6, dst, stride, l, a
+ mova m5, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_4x4_10 %+ SUFFIX).body
+
+INIT_XMM sse2
+cglobal vp9_ipred_tm_8x8_10, 4, 4, 8, dst, stride, l, a
+ mova m7, [pw_1023]
+.body:
+ pxor m6, m6
+ mova m5, [aq]
+ mova m4, [lq]
+ movd m0, [aq-2]
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ psubw m5, m0
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 2
+.loop:
+ pshufhw m0, m4, q3333
+ pshufhw m1, m4, q2222
+ pshufhw m2, m4, q1111
+ pshufhw m3, m4, q0000
+ punpckhqdq m0, m0
+ punpckhqdq m1, m1
+ punpckhqdq m2, m2
+ punpckhqdq m3, m3
+ paddw m0, m5
+ paddw m1, m5
+ paddw m2, m5
+ paddw m3, m5
+ pmaxsw m0, m6
+ pmaxsw m1, m6
+ pmaxsw m2, m6
+ pmaxsw m3, m6
+ pminsw m0, m7
+ pminsw m1, m7
+ pminsw m2, m7
+ pminsw m3, m7
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ movlhps m4, m4
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_tm_8x8_12, 4, 4, 8, dst, stride, l, a
+ mova m7, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_8x8_10 %+ SUFFIX).body
+
+INIT_XMM sse2
+cglobal vp9_ipred_tm_16x16_10, 4, 4, 8, dst, stride, l, a
+ mova m7, [pw_1023]
+.body:
+ pxor m6, m6
+ mova m4, [aq]
+ mova m5, [aq+mmsize]
+ mova m2, [lq]
+ mova m3, [lq+mmsize]
+ movd m0, [aq-2]
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ psubw m4, m0
+ psubw m5, m0
+ DEFINE_ARGS dst, stride, cnt
+ mov cntd, 8
+.loop:
+ pshufhw m0, m3, q3333
+ pslldq m3, 2
+ punpckhqdq m0, m0
+ paddw m1, m0, m5
+ paddw m0, m4
+ pmaxsw m1, m6
+ pmaxsw m0, m6
+ pminsw m1, m7
+ pminsw m0, m7
+ mova [dstq+strideq*0+16], m1
+ mova [dstq+strideq*0+ 0], m0
+ pshufhw m0, m2, q3333
+ pslldq m2, 2
+ punpckhqdq m0, m0
+ paddw m1, m0, m5
+ paddw m0, m4
+ pmaxsw m1, m6
+ pmaxsw m0, m6
+ pminsw m1, m7
+ pminsw m0, m7
+ mova [dstq+strideq*8+16], m1
+ mova [dstq+strideq*8+ 0], m0
+ add dstq, strideq
+ dec cntd
+ jg .loop
+ RET
+
+cglobal vp9_ipred_tm_16x16_12, 4, 4, 8, dst, stride, l, a
+ mova m7, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_16x16_10 %+ SUFFIX).body
+
+INIT_XMM sse2
+cglobal vp9_ipred_tm_32x32_10, 4, 5, 10, 32 * ARCH_X86_32, dst, stride, l, a
+ mova m0, [pw_1023]
+.body:
+ pxor m1, m1
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 1, 9
+%define reg_min m9
+%define reg_max m8
+%else
+ mova [rsp+ 0], m0
+ mova [rsp+16], m1
+%define reg_min [rsp+16]
+%define reg_max [rsp+ 0]
+%endif
+
+ mova m4, [aq+mmsize*0]
+ mova m5, [aq+mmsize*1]
+ mova m6, [aq+mmsize*2]
+ mova m7, [aq+mmsize*3]
+ movd m0, [aq-2]
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ psubw m4, m0
+ psubw m5, m0
+ psubw m6, m0
+ psubw m7, m0
+ DEFINE_ARGS dst, stride, l, cnt, cnto
+ mov cntod, 4
+.loopo:
+ mova m3, [lq+mmsize*3]
+ sub lq, mmsize
+ mov cntd, 8
+.loop:
+ pshufhw m0, m3, q3333
+ pslldq m3, 2
+ punpckhqdq m0, m0
+ paddw m1, m0, m4
+ paddw m2, m0, m5
+ pmaxsw m1, reg_min
+ pmaxsw m2, reg_min
+ pminsw m1, reg_max
+ pminsw m2, reg_max
+ mova [dstq+strideq*0+ 0], m1
+ mova [dstq+strideq*0+16], m2
+ paddw m1, m0, m6
+ paddw m0, m7
+ pmaxsw m1, reg_min
+ pmaxsw m0, reg_min
+ pminsw m1, reg_max
+ pminsw m0, reg_max
+ mova [dstq+strideq*0+32], m1
+ mova [dstq+strideq*0+48], m0
+ add dstq, strideq
+ dec cntd
+ jg .loop
+ dec cntod
+ jg .loopo
+ RET
+
+cglobal vp9_ipred_tm_32x32_12, 4, 5, 10, 32 * ARCH_X86_32, dst, stride, l, a
+ mova m0, [pw_4095]
+ jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body
diff --git a/libavcodec/x86/vp9lpf_16bpp.asm b/libavcodec/x86/vp9lpf_16bpp.asm
index 51294f0..5d6dc62 100644
--- a/libavcodec/x86/vp9lpf_16bpp.asm
+++ b/libavcodec/x86/vp9lpf_16bpp.asm
@@ -26,7 +26,6 @@ SECTION_RODATA
pw_511: times 16 dw 511
pw_2047: times 16 dw 2047
-pw_4095: times 16 dw 4095
pw_16384: times 16 dw 16384
pw_m512: times 16 dw -512
pw_m2048: times 16 dw -2048
@@ -38,6 +37,7 @@ cextern pw_8
cextern pw_16
cextern pw_256
cextern pw_1023
+cextern pw_4095
cextern pw_m1
SECTION .text
diff --git a/libavcodec/x86/vp9mc_16bpp.asm b/libavcodec/x86/vp9mc_16bpp.asm
index f60dab7..9a462ea 100644
--- a/libavcodec/x86/vp9mc_16bpp.asm
+++ b/libavcodec/x86/vp9mc_16bpp.asm
@@ -24,10 +24,10 @@
SECTION_RODATA 32
-pw_4095: times 16 dw 0xfff
pd_64: times 8 dd 64
cextern pw_1023
+cextern pw_4095
SECTION .text
--
2.1.2
More information about the ffmpeg-devel
mailing list