[FFmpeg-cvslog] x86/aacpsdsp: optimize ff_ps_stereo_interpolate_sse3

James Almer git at videolan.org
Sat Jun 3 18:41:42 EEST 2017


ffmpeg | branch: master | James Almer <jamrial at gmail.com> | Fri Jun  2 19:17:28 2017 -0300| [be3809a521fecfd3a61db99d660f243bd32b30bb] | committer: James Almer

x86/aacpsdsp: optimize ff_ps_stereo_interpolate_sse3

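Move the unpcklps/unpckhps unpacking of m0 and m1 out of the loop, so that each iteration only adds the already-unpacked step registers to the already-unpacked accumulators. 5% to 10% faster.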

Suggested-by: ubitux
Signed-off-by: James Almer <jamrial at gmail.com>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=be3809a521fecfd3a61db99d660f243bd32b30bb
---

 libavcodec/x86/aacpsdsp.asm | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/libavcodec/x86/aacpsdsp.asm b/libavcodec/x86/aacpsdsp.asm
index bb8a7f5df0..4548bb4257 100644
--- a/libavcodec/x86/aacpsdsp.asm
+++ b/libavcodec/x86/aacpsdsp.asm
@@ -93,6 +93,10 @@ cglobal ps_stereo_interpolate, 5, 5, 6, l, r, h, h_step, n
     movaps   m1, [h_stepq]
     cmp      nd, 0
     jle .ret
+    unpcklps m4, m0, m0
+    unpckhps m0, m0
+    unpcklps m5, m1, m1
+    unpckhps m1, m1
     shl      nd, 3
     add      lq, nq
     add      rq, nq
@@ -100,15 +104,12 @@ cglobal ps_stereo_interpolate, 5, 5, 6, l, r, h, h_step, n
 
 align 16
 .loop:
+    addps    m4, m5
     addps    m0, m1
     movddup  m2, [lq+nq]
     movddup  m3, [rq+nq]
-    movaps   m4, m0
-    movaps   m5, m0
-    unpcklps m4, m4
-    unpckhps m5, m5
     mulps    m2, m4
-    mulps    m3, m5
+    mulps    m3, m0
     addps    m2, m3
     movsd  [lq+nq], m2
     movhps [rq+nq], m2



More information about the ffmpeg-cvslog mailing list