[FFmpeg-cvslog] vp9: add 10bpp simd (mmxext/ssse3) for idct_idct_4x4.
Ronald S. Bultje
git at videolan.org
Tue Oct 13 17:06:44 CEST 2015
ffmpeg | branch: master | Ronald S. Bultje <rsbultje at gmail.com> | Tue Oct 6 11:03:45 2015 -0400| [6b579cf547a75a0cbda5cb7f10eab9ca07522b0a] | committer: Ronald S. Bultje
vp9: add 10bpp simd (mmxext/ssse3) for idct_idct_4x4.
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=6b579cf547a75a0cbda5cb7f10eab9ca07522b0a
---
libavcodec/x86/constants.c | 2 +
libavcodec/x86/constants.h | 1 +
libavcodec/x86/vp9dsp_init_16bpp_template.c | 12 ++++
libavcodec/x86/vp9itxfm.asm | 50 +-------------
libavcodec/x86/vp9itxfm_16bpp.asm | 96 +++++++++++++++++++++++++++
libavcodec/x86/vp9itxfm_template.asm | 47 +++++++++++++
6 files changed, 159 insertions(+), 49 deletions(-)
diff --git a/libavcodec/x86/constants.c b/libavcodec/x86/constants.c
index 9592fa7..7e3d490 100644
--- a/libavcodec/x86/constants.c
+++ b/libavcodec/x86/constants.c
@@ -85,6 +85,8 @@ DECLARE_ALIGNED(32, const ymm_reg, ff_pd_16) = { 0x0000001000000010ULL, 0x000
0x0000001000000010ULL, 0x0000001000000010ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_32) = { 0x0000002000000020ULL, 0x0000002000000020ULL,
0x0000002000000020ULL, 0x0000002000000020ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pd_8192) = { 0x0000200000002000ULL, 0x0000200000002000ULL,
+ 0x0000200000002000ULL, 0x0000200000002000ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_65535)= { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL };
diff --git a/libavcodec/x86/constants.h b/libavcodec/x86/constants.h
index 3605b63..496933f 100644
--- a/libavcodec/x86/constants.h
+++ b/libavcodec/x86/constants.h
@@ -65,6 +65,7 @@ extern const xmm_reg ff_ps_neg;
extern const ymm_reg ff_pd_1;
extern const ymm_reg ff_pd_16;
extern const ymm_reg ff_pd_32;
+extern const ymm_reg ff_pd_8192;
extern const ymm_reg ff_pd_65535;
# if ARCH_X86_64
diff --git a/libavcodec/x86/vp9dsp_init_16bpp_template.c b/libavcodec/x86/vp9dsp_init_16bpp_template.c
index 7a56c3b..6e12af3 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp_template.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp_template.c
@@ -125,6 +125,10 @@ lpf_mix2_wrappers_set(BPC, avx);
decl_ipred_fns(tm, BPC, mmxext, sse2);
decl_itxfm_func(iwht, iwht, 4, BPC, mmxext);
+#if BPC == 10
+decl_itxfm_func(idct, idct, 4, BPC, mmxext);
+decl_itxfm_func(idct, idct, 4, BPC, ssse3);
+#endif
#endif /* HAVE_YASM */
av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
@@ -170,6 +174,9 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
init_ipred_func(tm, TM_VP8, 4, BPC, mmxext);
if (!bitexact) {
init_itx_func_one(4 /* lossless */, iwht, iwht, 4, BPC, mmxext);
+#if BPC == 10
+ init_itx_func(TX_4X4, DCT_DCT, idct, idct, 4, 10, mmxext);
+#endif
}
}
@@ -182,6 +189,11 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
if (EXTERNAL_SSSE3(cpu_flags)) {
init_lpf_funcs(BPC, ssse3);
+#if BPC == 10
+ if (!bitexact) {
+ init_itx_func(TX_4X4, DCT_DCT, idct, idct, 4, 10, ssse3);
+ }
+#endif
}
if (EXTERNAL_AVX(cpu_flags)) {
diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm
index c564f27..200f15e 100644
--- a/libavcodec/x86/vp9itxfm.asm
+++ b/libavcodec/x86/vp9itxfm.asm
@@ -71,8 +71,6 @@ pw_13377x2: times 8 dw 13377*2
pw_m13377_13377: times 4 dw -13377, 13377
pw_13377_0: times 4 dw 13377, 0
-pd_8192: times 4 dd 8192
-
cextern pw_8
cextern pw_16
cextern pw_32
@@ -80,38 +78,10 @@ cextern pw_512
cextern pw_1024
cextern pw_2048
cextern pw_m1
+cextern pd_8192
SECTION .text
-; (a*x + b*y + round) >> shift
-%macro VP9_MULSUB_2W_2X 5 ; dst1, dst2/src, round, coefs1, coefs2
- pmaddwd m%1, m%2, %4
- pmaddwd m%2, %5
- paddd m%1, %3
- paddd m%2, %3
- psrad m%1, 14
- psrad m%2, 14
-%endmacro
-
-%macro VP9_MULSUB_2W_4X 7 ; dst1, dst2, coef1, coef2, rnd, tmp1/src, tmp2
- VP9_MULSUB_2W_2X %7, %6, %5, [pw_m%3_%4], [pw_%4_%3]
- VP9_MULSUB_2W_2X %1, %2, %5, [pw_m%3_%4], [pw_%4_%3]
- packssdw m%1, m%7
- packssdw m%2, m%6
-%endmacro
-
-%macro VP9_UNPACK_MULSUB_2W_4X 7-9 ; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2
-%if %0 == 7
- punpckhwd m%6, m%2, m%1
- punpcklwd m%2, m%1
- VP9_MULSUB_2W_4X %1, %2, %3, %4, %5, %6, %7
-%else
- punpckhwd m%8, m%4, m%3
- punpcklwd m%2, m%4, m%3
- VP9_MULSUB_2W_4X %1, %2, %5, %6, %7, %8, %9
-%endif
-%endmacro
-
%macro VP9_UNPACK_MULSUB_2D_4X 6 ; dst1 [src1], dst2 [src2], dst3, dst4, mul1, mul2
punpckhwd m%4, m%2, m%1
punpcklwd m%2, m%1
@@ -191,24 +161,6 @@ cglobal vp9_iwht_iwht_4x4_add, 3, 3, 0, dst, stride, block, eob
; void vp9_idct_idct_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
;-------------------------------------------------------------------------------------------
-%macro VP9_IDCT4_1D_FINALIZE 0
- SUMSUB_BA w, 3, 2, 4 ; m3=t3+t0, m2=-t3+t0
- SUMSUB_BA w, 1, 0, 4 ; m1=t2+t1, m0=-t2+t1
- SWAP 0, 3, 2 ; 3102 -> 0123
-%endmacro
-
-%macro VP9_IDCT4_1D 0
-%if cpuflag(ssse3)
- SUMSUB_BA w, 2, 0, 4 ; m2=IN(0)+IN(2) m0=IN(0)-IN(2)
- pmulhrsw m2, m6 ; m2=t0
- pmulhrsw m0, m6 ; m0=t1
-%else ; <= sse2
- VP9_UNPACK_MULSUB_2W_4X 0, 2, 11585, 11585, m7, 4, 5 ; m0=t1, m1=t0
-%endif
- VP9_UNPACK_MULSUB_2W_4X 1, 3, 15137, 6270, m7, 4, 5 ; m1=t2, m3=t3
- VP9_IDCT4_1D_FINALIZE
-%endmacro
-
; 2x2 top left corner
%macro VP9_IDCT4_2x2_1D 0
pmulhrsw m0, m5 ; m0=t1
diff --git a/libavcodec/x86/vp9itxfm_16bpp.asm b/libavcodec/x86/vp9itxfm_16bpp.asm
index 58869e6..e067438 100644
--- a/libavcodec/x86/vp9itxfm_16bpp.asm
+++ b/libavcodec/x86/vp9itxfm_16bpp.asm
@@ -25,8 +25,18 @@
SECTION_RODATA
+cextern pw_8
cextern pw_1023
+cextern pw_2048
cextern pw_4095
+cextern pd_8192
+
+; FIXME these should probably be shared between 8bpp and 10/12bpp
+pw_m11585_11585: times 4 dw -11585, 11585
+pw_11585_11585: times 8 dw 11585
+pw_m15137_6270: times 4 dw -15137, 6270
+pw_6270_15137: times 4 dw 6270, 15137
+pw_11585x2: times 8 dw 11585*2
SECTION .text
@@ -118,3 +128,89 @@ INIT_MMX mmxext
IWHT4_FN 10, 1023
INIT_MMX mmxext
IWHT4_FN 12, 4095
+
+; 4x4 coefficients are 5+depth+sign bits, so for 10bpp, everything still fits
+; in 15+1 words without additional effort, since the coefficients are 15bpp.
+
+%macro IDCT4_10_FN 0
+cglobal vp9_idct_idct_4x4_add_10, 4, 4, 8, dst, stride, block, eob
+ cmp eobd, 1
+ jg .idctfull
+
+ ; dc-only
+%if cpuflag(ssse3)
+ movd m0, [blockq]
+ mova m5, [pw_11585x2]
+ pmulhrsw m0, m5
+ pmulhrsw m0, m5
+%else
+ DEFINE_ARGS dst, stride, block, coef
+ mov coefd, dword [blockq]
+ imul coefd, 11585
+ add coefd, 8192
+ sar coefd, 14
+ imul coefd, 11585
+ add coefd, (8 << 14) + 8192
+ sar coefd, 14 + 4
+ movd m0, coefd
+%endif
+ pshufw m0, m0, 0
+ pxor m4, m4
+ mova m5, [pw_1023]
+ movh [blockq], m4
+%if cpuflag(ssse3)
+ pmulhrsw m0, [pw_2048] ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
+%endif
+ VP9_STORE_2X 0, 0, 6, 7, 4, 5
+ lea dstq, [dstq+2*strideq]
+ VP9_STORE_2X 0, 0, 6, 7, 4, 5
+ RET
+
+.idctfull:
+ mova m0, [blockq+0*16+0]
+ mova m1, [blockq+1*16+0]
+ packssdw m0, [blockq+0*16+8]
+ packssdw m1, [blockq+1*16+8]
+ mova m2, [blockq+2*16+0]
+ mova m3, [blockq+3*16+0]
+ packssdw m2, [blockq+2*16+8]
+ packssdw m3, [blockq+3*16+8]
+
+%if cpuflag(ssse3)
+ mova m6, [pw_11585x2]
+%endif
+ mova m7, [pd_8192] ; rounding
+ VP9_IDCT4_1D
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ VP9_IDCT4_1D
+
+ pxor m4, m4
+ ZERO_BLOCK blockq, 16, 4, m4
+%if cpuflag(ssse3)
+ mova m5, [pw_2048]
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ pmulhrsw m3, m5
+%else
+ mova m5, [pw_8]
+ paddw m0, m5
+ paddw m1, m5
+ paddw m2, m5
+ paddw m3, m5
+ psraw m0, 4
+ psraw m1, 4
+ psraw m2, 4
+ psraw m3, 4
+%endif
+ mova m5, [pw_1023]
+ VP9_STORE_2X 0, 1, 6, 7, 4, 5
+ lea dstq, [dstq+2*strideq]
+ VP9_STORE_2X 2, 3, 6, 7, 4, 5
+ RET
+%endmacro
+
+INIT_MMX mmxext
+IDCT4_10_FN
+INIT_MMX ssse3
+IDCT4_10_FN
diff --git a/libavcodec/x86/vp9itxfm_template.asm b/libavcodec/x86/vp9itxfm_template.asm
index 43cf3aa..f1a05a5 100644
--- a/libavcodec/x86/vp9itxfm_template.asm
+++ b/libavcodec/x86/vp9itxfm_template.asm
@@ -35,3 +35,50 @@
paddw m3, m2
SWAP 3, 2, 1
%endmacro
+
+; (a*x + b*y + round) >> shift
+%macro VP9_MULSUB_2W_2X 5 ; dst1, dst2/src, round, coefs1, coefs2
+ pmaddwd m%1, m%2, %4
+ pmaddwd m%2, %5
+ paddd m%1, %3
+ paddd m%2, %3
+ psrad m%1, 14
+ psrad m%2, 14
+%endmacro
+
+%macro VP9_MULSUB_2W_4X 7 ; dst1, dst2, coef1, coef2, rnd, tmp1/src, tmp2
+ VP9_MULSUB_2W_2X %7, %6, %5, [pw_m%3_%4], [pw_%4_%3]
+ VP9_MULSUB_2W_2X %1, %2, %5, [pw_m%3_%4], [pw_%4_%3]
+ packssdw m%1, m%7
+ packssdw m%2, m%6
+%endmacro
+
+%macro VP9_UNPACK_MULSUB_2W_4X 7-9 ; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2
+%if %0 == 7
+ punpckhwd m%6, m%2, m%1
+ punpcklwd m%2, m%1
+ VP9_MULSUB_2W_4X %1, %2, %3, %4, %5, %6, %7
+%else
+ punpckhwd m%8, m%4, m%3
+ punpcklwd m%2, m%4, m%3
+ VP9_MULSUB_2W_4X %1, %2, %5, %6, %7, %8, %9
+%endif
+%endmacro
+
+%macro VP9_IDCT4_1D_FINALIZE 0
+ SUMSUB_BA w, 3, 2, 4 ; m3=t3+t0, m2=-t3+t0
+ SUMSUB_BA w, 1, 0, 4 ; m1=t2+t1, m0=-t2+t1
+ SWAP 0, 3, 2 ; 3102 -> 0123
+%endmacro
+
+%macro VP9_IDCT4_1D 0
+%if cpuflag(ssse3)
+ SUMSUB_BA w, 2, 0, 4 ; m2=IN(0)+IN(2) m0=IN(0)-IN(2)
+ pmulhrsw m2, m6 ; m2=t0
+ pmulhrsw m0, m6 ; m0=t1
+%else ; <= sse2
+ VP9_UNPACK_MULSUB_2W_4X 0, 2, 11585, 11585, m7, 4, 5 ; m0=t1, m1=t0
+%endif
+ VP9_UNPACK_MULSUB_2W_4X 1, 3, 15137, 6270, m7, 4, 5 ; m1=t2, m3=t3
+ VP9_IDCT4_1D_FINALIZE
+%endmacro
More information about the ffmpeg-cvslog
mailing list