[FFmpeg-cvslog] x86/hevc_mc: optimize AVX2 mc functions

James Almer git at videolan.org
Thu Feb 12 17:22:36 CET 2015


ffmpeg | branch: master | James Almer <jamrial at gmail.com> | Thu Feb 12 03:11:37 2015 -0300| [1679d68dbfae8414e1e805823758c02c17188dd4] | committer: James Almer

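x86/hevc_mc: optimize AVX2 mc functions

Replace each vextracti128 + vinserti128 pair with a single vperm2i128,
and use SWAP instead of a mova register copy where possible.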

Before
40766 decicycles in ff_hevc_put_hevc_qpel_h64_8_avx2, 8192 runs, 0 skips

After
37975 decicycles in ff_hevc_put_hevc_qpel_h64_8_avx2, 8192 runs, 0 skips

Reviewed-by: Christophe Gisquet <christophe.gisquet at gmail.com>
Signed-off-by: James Almer <jamrial at gmail.com>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=1679d68dbfae8414e1e805823758c02c17188dd4
---

 libavcodec/x86/hevc_mc.asm |   32 ++++++++++++--------------------
 1 file changed, 12 insertions(+), 20 deletions(-)

diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm
index 027daa8..9a4c9ca 100644
--- a/libavcodec/x86/hevc_mc.asm
+++ b/libavcodec/x86/hevc_mc.asm
@@ -489,14 +489,12 @@ QPEL_TABLE 10, 8, w, avx2
 %if %1 == 8
 %if cpuflag(avx2) && (%0 == 5)
 %if %2 > 16
-    vextracti128  xm10, m0, 1
-    vinserti128    m10, m1, xm10, 0
+    vperm2i128    m10, m0, m1, q0301
 %endif
     vinserti128    m0, m0, xm1, 1
     mova           m1, m10
 %if %2 > 16
-    vextracti128 xm10, m2, 1
-    vinserti128   m10, m3, xm10, 0
+    vperm2i128    m10, m2, m3, q0301
 %endif
     vinserti128    m2, m2, xm3, 1
     mova           m3, m10
@@ -583,26 +581,22 @@ QPEL_TABLE 10, 8, w, avx2
 %if %2 == 8
 %if cpuflag(avx2) && (%0 == 3)
 
-    vextracti128 xm10, m0, 1
-    vinserti128 m10, m1, xm10, 0
+    vperm2i128 m10, m0,  m1, q0301
     vinserti128 m0, m0, xm1, 1
-    mova m1, m10
+    SWAP 1, 10
 
-    vextracti128 xm10, m2, 1
-    vinserti128 m10, m3, xm10, 0
+    vperm2i128 m10, m2,  m3, q0301
     vinserti128 m2, m2, xm3, 1
-    mova m3, m10
+    SWAP 3, 10
 
 
-    vextracti128 xm10, m4, 1
-    vinserti128 m10, m5, xm10, 0
+    vperm2i128 m10, m4,  m5, q0301
     vinserti128 m4, m4, xm5, 1
-    mova m5, m10
+    SWAP 5, 10
 
-    vextracti128 xm10, m6, 1
-    vinserti128 m10, m7, xm10, 0
+    vperm2i128 m10, m6,  m7, q0301
     vinserti128 m6, m6, xm7, 1
-    mova m7, m10
+    SWAP 7, 10
 %endif
 
     pmaddubsw         m0, m12   ;x1*c1+x2*c2
@@ -889,8 +883,7 @@ cglobal hevc_put_hevc_epel_hv%1_%2, 6, 8, 16 , dst, src, srcstride, height, mx,
     EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
 %if cpuflag(avx2)
     vinserti128       m2, m0, xm4, 1
-    vextracti128      xm3, m0, 1
-    vinserti128       m3, m4, xm3, 0
+    vperm2i128        m3, m0, m4, q0301
     PEL_10STORE%1     dstq, m2, m3
 %else
     PEL_10STORE%1     dstq, m0, m4
@@ -1021,8 +1014,7 @@ cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride
     SIMPLE_BILOAD     %1, src2q, m8, m3
 %if cpuflag(avx2)
     vinserti128       m1, m8, xm3, 1
-    vextracti128      xm8, m8, 1
-    vinserti128       m2, m3, xm8, 0
+    vperm2i128        m2, m8, m3, q0301
     BI_COMPUTE        %1, %2, m0, m4, m1, m2, [pw_bi_%2]
 %else
     BI_COMPUTE        %1, %2, m0, m4, m8, m3, [pw_bi_%2]



More information about the ffmpeg-cvslog mailing list