[FFmpeg-devel] [PATCH 5/9] x86: proresdsp: simple_idct: free or use 1 xmm reg

Christophe Gisquet christophe.gisquet at gmail.com
Sun Oct 11 16:06:09 CEST 2015


m15 is zeroed but never used. Where it is not needed (prores), decrease
the number of xmm regs used by 1; otherwise (simple_idct10), make use of
it to hold the rounder for the row pass.
---
 libavcodec/x86/proresdsp.asm              |  8 ++++----
 libavcodec/x86/simple_idct10.asm          |  9 +++++----
 libavcodec/x86/simple_idct10_template.asm | 17 ++++++++---------
 3 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/libavcodec/x86/proresdsp.asm b/libavcodec/x86/proresdsp.asm
index 18cf15b..3fb71ba 100644
--- a/libavcodec/x86/proresdsp.asm
+++ b/libavcodec/x86/proresdsp.asm
@@ -37,17 +37,17 @@ cextern pw_1019
 
 section .text align=16
 
-%macro idct_put_fn 1
-cglobal prores_idct_put_10, 4, 4, %1
+%macro idct_put_fn 0
+cglobal prores_idct_put_10, 4, 4, 15
     IDCT_PUT_FN    pw_1, 15, pw_88, 18, pw_4, pw_1019, r3
     RET
 %endmacro
 
 INIT_XMM sse2
-idct_put_fn 16
+idct_put_fn
 %if HAVE_AVX_EXTERNAL
 INIT_XMM avx
-idct_put_fn 16
+idct_put_fn
 %endif
 
 %endif
diff --git a/libavcodec/x86/simple_idct10.asm b/libavcodec/x86/simple_idct10.asm
index 982fb1e..cd2b905 100644
--- a/libavcodec/x86/simple_idct10.asm
+++ b/libavcodec/x86/simple_idct10.asm
@@ -37,17 +37,18 @@ pd_round: times 4 dd 1<<(12-1)
 
 section .text align=16
 
-%macro idct_put_fn 1
-cglobal simple_idct10_put, 3, 3, %1
+%macro idct_put_fn 0
+cglobal simple_idct10_put, 3, 3, 16
+    mova          m15, [pd_round]
     IDCT_PUT_FN    "", 12, pw_16, 19, 0, pw_1023
     RET
 %endmacro
 
 INIT_XMM sse2
-idct_put_fn 16
+idct_put_fn
 %if HAVE_AVX_EXTERNAL
 INIT_XMM avx
-idct_put_fn 16
+idct_put_fn
 %endif
 
 %endif
diff --git a/libavcodec/x86/simple_idct10_template.asm b/libavcodec/x86/simple_idct10_template.asm
index 86c2765..d4a08f8 100644
--- a/libavcodec/x86/simple_idct10_template.asm
+++ b/libavcodec/x86/simple_idct10_template.asm
@@ -90,14 +90,14 @@ cextern w7_min_w5
     pmaddwd     m1, [w4_plus_w2]
 %ifstr %1
     ; 1<<(%1-1)
-    paddd       m2, [pd_round]
-    paddd       m3, [pd_round]
-    paddd       m4, [pd_round]
-    paddd       m5, [pd_round]
-    paddd       m6, [pd_round]
-    paddd       m7, [pd_round]
-    paddd       m0, [pd_round]
-    paddd       m1, [pd_round]
+    paddd       m2, m15
+    paddd       m3, m15
+    paddd       m4, m15
+    paddd       m5, m15
+    paddd       m6, m15
+    paddd       m7, m15
+    paddd       m0, m15
+    paddd       m1, m15
 %endif
 
     ; a0: -1*row[0]-1*row[2]
@@ -237,7 +237,6 @@ cextern w7_min_w5
 
 %macro IDCT_PUT_FN 6-7
     movsxd      r1,  r1d
-    pxor        m15, m15           ; zero
 
     ; for (i = 0; i < 8; i++)
     ;     idctRowCondDC(block + i*8);
-- 
2.6.0



More information about the ffmpeg-devel mailing list