[FFmpeg-cvslog] x86inc: Use VEX-encoded instructions in AVX functions

Henrik Gramner git at videolan.org
Mon Oct 14 15:53:36 CEST 2013


ffmpeg | branch: master | Henrik Gramner <hengar-6 at student.ltu.se> | Tue Dec 11 16:05:34 2012 +0100| [c108ba0175d4fc3a3253a8b0f782fbfb96ba5098] | committer: Derek Buitenhuis

x86inc: Use VEX-encoded instructions in AVX functions

Automatically use VEX-encoding in AVX/AVX2/XOP/FMA3/FMA4
functions for all instructions that exists in a VEX-encoded
version.

This change makes it easier to extend existing code to use AVX2.

Also add support for AVX emulation of a few instructions that
were missing before.

Signed-off-by: Derek Buitenhuis <derek.buitenhuis at gmail.com>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=c108ba0175d4fc3a3253a8b0f782fbfb96ba5098
---

 libavutil/x86/x86inc.asm |  253 +++++++++++++++++++++++++++++++---------------
 1 file changed, 169 insertions(+), 84 deletions(-)

diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index cd42caf..b2b67c5 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -775,7 +775,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
         %if cpuflag(avx)
             %assign avx_enabled 1
         %endif
-        %if mmsize == 16 && notcpuflag(sse2)
+        %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
             %define mova movaps
             %define movu movups
             %define movnta movntps
@@ -861,10 +861,10 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
     %if ARCH_X86_64
     %define num_mmregs 16
     %endif
-    %define mova vmovaps
-    %define movu vmovups
+    %define mova movdqa
+    %define movu movdqu
     %undef movh
-    %define movnta vmovntps
+    %define movnta movntdq
     %assign %%i 0
     %rep num_mmregs
     CAT_XDEFINE m, %%i, ymm %+ %%i
@@ -1045,101 +1045,107 @@ INIT_XMM
 
 ;%1 == instruction
 ;%2 == 1 if float, 0 if int
-;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
-;%4 == number of operands given
+;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
+;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
 ;%5+: operands
-%macro RUN_AVX_INSTR 6-7+
-    %ifid %6
-        %define %%sizeofreg sizeof%6
-    %elifid %5
-        %define %%sizeofreg sizeof%5
+%macro RUN_AVX_INSTR 5-8+
+    %ifnum sizeof%6
+        %assign %%sizeofreg sizeof%6
+    %elifnum sizeof%5
+        %assign %%sizeofreg sizeof%5
     %else
-        %define %%sizeofreg mmsize
+        %assign %%sizeofreg mmsize
     %endif
-    %if %%sizeofreg==32
-        %if %4>=3
-            v%1 %5, %6, %7
-        %else
-            v%1 %5, %6
-        %endif
+    %assign %%emulate_avx 0
+    %if avx_enabled && %%sizeofreg >= 16
+        %xdefine %%instr v%1
     %else
-        %if %%sizeofreg==8
-            %define %%regmov movq
-        %elif %2
-            %define %%regmov movaps
-        %else
-            %define %%regmov movdqa
+        %xdefine %%instr %1
+        %if %0 >= 7+%3
+            %assign %%emulate_avx 1
         %endif
+    %endif
 
-        %if %4>=3+%3
-            %ifnidn %5, %6
-                %if avx_enabled && %%sizeofreg==16
-                    v%1 %5, %6, %7
-                %else
-                    CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
-                    %%regmov %5, %6
-                    %1 %5, %7
+    %if %%emulate_avx
+        %xdefine %%src1 %6
+        %xdefine %%src2 %7
+        %ifnidn %5, %6
+            %if %0 >= 8
+                CHECK_AVX_INSTR_EMU {%1 %5, %6, %7, %8}, %5, %7, %8
+            %else
+                CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
+            %endif
+            %if %4 && %3 == 0
+                %ifnid %7
+                    ; 3-operand AVX instructions with a memory arg can only have it in src2,
+                    ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
+                    ; So, if the instruction is commutative with a memory arg, swap them.
+                    %xdefine %%src1 %7
+                    %xdefine %%src2 %6
                 %endif
+            %endif
+            %if %%sizeofreg == 8
+                MOVQ %5, %%src1
+            %elif %2
+                MOVAPS %5, %%src1
             %else
-                %1 %5, %7
+                MOVDQA %5, %%src1
             %endif
-        %elif %4>=3
-            %1 %5, %6, %7
-        %else
-            %1 %5, %6
         %endif
-    %endif
-%endmacro
-
-; 3arg AVX ops with a memory arg can only have it in src2,
-; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
-; So, if the op is symmetric and the wrong one is memory, swap them.
-%macro RUN_AVX_INSTR1 8
-    %assign %%swap 0
-    %if avx_enabled
-        %ifnid %6
-            %assign %%swap 1
-        %endif
-    %elifnidn %5, %6
-        %ifnid %7
-            %assign %%swap 1
+        %if %0 >= 8
+            %1 %5, %%src2, %8
+        %else
+            %1 %5, %%src2
         %endif
-    %endif
-    %if %%swap && %3 == 0 && %8 == 1
-        RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
+    %elif %0 >= 8
+        %%instr %5, %6, %7, %8
+    %elif %0 == 7
+        %%instr %5, %6, %7
+    %elif %0 == 6
+        %%instr %5, %6
     %else
-        RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
+        %%instr %5
     %endif
 %endmacro
 
 ;%1 == instruction
 ;%2 == 1 if float, 0 if int
-;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
-;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
-%macro AVX_INSTR 4
-    %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
-        %ifidn %3, fnord
-            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
+;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
+;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
+%macro AVX_INSTR 1-4 0, 1, 0
+    %macro %1 1-9 fnord, fnord, fnord, fnord, %1, %2, %3, %4
+        %ifidn %2, fnord
+            RUN_AVX_INSTR %6, %7, %8, %9, %1
+        %elifidn %3, fnord
+            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2
         %elifidn %4, fnord
-            RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
+            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3
         %elifidn %5, fnord
-            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
+            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4
         %else
-            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
+            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4, %5
         %endif
     %endmacro
 %endmacro
 
+; Instructions with both VEX and non-VEX encodings
+; Non-destructive instructions are written without parameters
 AVX_INSTR addpd, 1, 0, 1
 AVX_INSTR addps, 1, 0, 1
 AVX_INSTR addsd, 1, 0, 1
 AVX_INSTR addss, 1, 0, 1
 AVX_INSTR addsubpd, 1, 0, 0
 AVX_INSTR addsubps, 1, 0, 0
-AVX_INSTR andpd, 1, 0, 1
-AVX_INSTR andps, 1, 0, 1
+AVX_INSTR aesdec, 0, 0, 0
+AVX_INSTR aesdeclast, 0, 0, 0
+AVX_INSTR aesenc, 0, 0, 0
+AVX_INSTR aesenclast, 0, 0, 0
+AVX_INSTR aesimc
+AVX_INSTR aeskeygenassist
 AVX_INSTR andnpd, 1, 0, 0
 AVX_INSTR andnps, 1, 0, 0
+AVX_INSTR andpd, 1, 0, 1
+AVX_INSTR andps, 1, 0, 1
 AVX_INSTR blendpd, 1, 0, 0
 AVX_INSTR blendps, 1, 0, 0
 AVX_INSTR blendvpd, 1, 0, 0
@@ -1148,19 +1154,39 @@ AVX_INSTR cmppd, 1, 1, 0
 AVX_INSTR cmpps, 1, 1, 0
 AVX_INSTR cmpsd, 1, 1, 0
 AVX_INSTR cmpss, 1, 1, 0
-AVX_INSTR cvtdq2ps, 1, 0, 0
-AVX_INSTR cvtpd2dq, 1, 0, 0
-AVX_INSTR cvtps2dq, 1, 0, 0
+AVX_INSTR comisd
+AVX_INSTR comiss
+AVX_INSTR cvtdq2pd
+AVX_INSTR cvtdq2ps
+AVX_INSTR cvtpd2dq
+AVX_INSTR cvtpd2ps
+AVX_INSTR cvtps2dq
+AVX_INSTR cvtps2pd
+AVX_INSTR cvtsd2si
+AVX_INSTR cvtsd2ss
+AVX_INSTR cvtsi2sd
+AVX_INSTR cvtsi2ss
+AVX_INSTR cvtss2sd
+AVX_INSTR cvtss2si
+AVX_INSTR cvttpd2dq
+AVX_INSTR cvttps2dq
+AVX_INSTR cvttsd2si
+AVX_INSTR cvttss2si
 AVX_INSTR divpd, 1, 0, 0
 AVX_INSTR divps, 1, 0, 0
 AVX_INSTR divsd, 1, 0, 0
 AVX_INSTR divss, 1, 0, 0
 AVX_INSTR dppd, 1, 1, 0
 AVX_INSTR dpps, 1, 1, 0
+AVX_INSTR extractps
 AVX_INSTR haddpd, 1, 0, 0
 AVX_INSTR haddps, 1, 0, 0
 AVX_INSTR hsubpd, 1, 0, 0
 AVX_INSTR hsubps, 1, 0, 0
+AVX_INSTR insertps, 1, 1, 0
+AVX_INSTR lddqu
+AVX_INSTR ldmxcsr
+AVX_INSTR maskmovdqu
 AVX_INSTR maxpd, 1, 0, 1
 AVX_INSTR maxps, 1, 0, 1
 AVX_INSTR maxsd, 1, 0, 1
@@ -1169,10 +1195,31 @@ AVX_INSTR minpd, 1, 0, 1
 AVX_INSTR minps, 1, 0, 1
 AVX_INSTR minsd, 1, 0, 1
 AVX_INSTR minss, 1, 0, 1
+AVX_INSTR movapd
+AVX_INSTR movaps
+AVX_INSTR movd
+AVX_INSTR movddup
+AVX_INSTR movdqa
+AVX_INSTR movdqu
 AVX_INSTR movhlps, 1, 0, 0
+AVX_INSTR movhpd, 1, 0, 0
+AVX_INSTR movhps, 1, 0, 0
 AVX_INSTR movlhps, 1, 0, 0
+AVX_INSTR movlpd, 1, 0, 0
+AVX_INSTR movlps, 1, 0, 0
+AVX_INSTR movmskpd
+AVX_INSTR movmskps
+AVX_INSTR movntdq
+AVX_INSTR movntdqa
+AVX_INSTR movntpd
+AVX_INSTR movntps
+AVX_INSTR movq
 AVX_INSTR movsd, 1, 0, 0
+AVX_INSTR movshdup
+AVX_INSTR movsldup
 AVX_INSTR movss, 1, 0, 0
+AVX_INSTR movupd
+AVX_INSTR movups
 AVX_INSTR mpsadbw, 0, 1, 0
 AVX_INSTR mulpd, 1, 0, 1
 AVX_INSTR mulps, 1, 0, 1
@@ -1180,9 +1227,9 @@ AVX_INSTR mulsd, 1, 0, 1
 AVX_INSTR mulss, 1, 0, 1
 AVX_INSTR orpd, 1, 0, 1
 AVX_INSTR orps, 1, 0, 1
-AVX_INSTR pabsb, 0, 0, 0
-AVX_INSTR pabsw, 0, 0, 0
-AVX_INSTR pabsd, 0, 0, 0
+AVX_INSTR pabsb
+AVX_INSTR pabsd
+AVX_INSTR pabsw
 AVX_INSTR packsswb, 0, 0, 0
 AVX_INSTR packssdw, 0, 0, 0
 AVX_INSTR packuswb, 0, 0, 0
@@ -1202,10 +1249,11 @@ AVX_INSTR pavgb, 0, 0, 1
 AVX_INSTR pavgw, 0, 0, 1
 AVX_INSTR pblendvb, 0, 0, 0
 AVX_INSTR pblendw, 0, 1, 0
-AVX_INSTR pcmpestri, 0, 0, 0
-AVX_INSTR pcmpestrm, 0, 0, 0
-AVX_INSTR pcmpistri, 0, 0, 0
-AVX_INSTR pcmpistrm, 0, 0, 0
+AVX_INSTR pclmulqdq, 0, 1, 0
+AVX_INSTR pcmpestri
+AVX_INSTR pcmpestrm
+AVX_INSTR pcmpistri
+AVX_INSTR pcmpistrm
 AVX_INSTR pcmpeqb, 0, 0, 1
 AVX_INSTR pcmpeqw, 0, 0, 1
 AVX_INSTR pcmpeqd, 0, 0, 1
@@ -1214,12 +1262,21 @@ AVX_INSTR pcmpgtb, 0, 0, 0
 AVX_INSTR pcmpgtw, 0, 0, 0
 AVX_INSTR pcmpgtd, 0, 0, 0
 AVX_INSTR pcmpgtq, 0, 0, 0
+AVX_INSTR pextrb
+AVX_INSTR pextrd
+AVX_INSTR pextrq
+AVX_INSTR pextrw
 AVX_INSTR phaddw, 0, 0, 0
 AVX_INSTR phaddd, 0, 0, 0
 AVX_INSTR phaddsw, 0, 0, 0
+AVX_INSTR phminposuw
 AVX_INSTR phsubw, 0, 0, 0
 AVX_INSTR phsubd, 0, 0, 0
 AVX_INSTR phsubsw, 0, 0, 0
+AVX_INSTR pinsrb, 0, 1, 0
+AVX_INSTR pinsrd, 0, 1, 0
+AVX_INSTR pinsrq, 0, 1, 0
+AVX_INSTR pinsrw, 0, 1, 0
 AVX_INSTR pmaddwd, 0, 0, 1
 AVX_INSTR pmaddubsw, 0, 0, 0
 AVX_INSTR pmaxsb, 0, 0, 1
@@ -1234,20 +1291,32 @@ AVX_INSTR pminsd, 0, 0, 1
 AVX_INSTR pminub, 0, 0, 1
 AVX_INSTR pminuw, 0, 0, 1
 AVX_INSTR pminud, 0, 0, 1
-AVX_INSTR pmovmskb, 0, 0, 0
-AVX_INSTR pmulhuw, 0, 0, 1
+AVX_INSTR pmovmskb
+AVX_INSTR pmovsxbw
+AVX_INSTR pmovsxbd
+AVX_INSTR pmovsxbq
+AVX_INSTR pmovsxwd
+AVX_INSTR pmovsxwq
+AVX_INSTR pmovsxdq
+AVX_INSTR pmovzxbw
+AVX_INSTR pmovzxbd
+AVX_INSTR pmovzxbq
+AVX_INSTR pmovzxwd
+AVX_INSTR pmovzxwq
+AVX_INSTR pmovzxdq
+AVX_INSTR pmuldq, 0, 0, 1
 AVX_INSTR pmulhrsw, 0, 0, 1
+AVX_INSTR pmulhuw, 0, 0, 1
 AVX_INSTR pmulhw, 0, 0, 1
 AVX_INSTR pmullw, 0, 0, 1
 AVX_INSTR pmulld, 0, 0, 1
 AVX_INSTR pmuludq, 0, 0, 1
-AVX_INSTR pmuldq, 0, 0, 1
 AVX_INSTR por, 0, 0, 1
 AVX_INSTR psadbw, 0, 0, 1
 AVX_INSTR pshufb, 0, 0, 0
-AVX_INSTR pshufd, 0, 1, 0
-AVX_INSTR pshufhw, 0, 1, 0
-AVX_INSTR pshuflw, 0, 1, 0
+AVX_INSTR pshufd
+AVX_INSTR pshufhw
+AVX_INSTR pshuflw
 AVX_INSTR psignb, 0, 0, 0
 AVX_INSTR psignw, 0, 0, 0
 AVX_INSTR psignd, 0, 0, 0
@@ -1269,7 +1338,7 @@ AVX_INSTR psubsb, 0, 0, 0
 AVX_INSTR psubsw, 0, 0, 0
 AVX_INSTR psubusb, 0, 0, 0
 AVX_INSTR psubusw, 0, 0, 0
-AVX_INSTR ptest, 0, 0, 0
+AVX_INSTR ptest
 AVX_INSTR punpckhbw, 0, 0, 0
 AVX_INSTR punpckhwd, 0, 0, 0
 AVX_INSTR punpckhdq, 0, 0, 0
@@ -1279,11 +1348,27 @@ AVX_INSTR punpcklwd, 0, 0, 0
 AVX_INSTR punpckldq, 0, 0, 0
 AVX_INSTR punpcklqdq, 0, 0, 0
 AVX_INSTR pxor, 0, 0, 1
+AVX_INSTR rcpps, 1, 0, 0
+AVX_INSTR rcpss, 1, 0, 0
+AVX_INSTR roundpd
+AVX_INSTR roundps
+AVX_INSTR roundsd
+AVX_INSTR roundss
+AVX_INSTR rsqrtps, 1, 0, 0
+AVX_INSTR rsqrtss, 1, 0, 0
+AVX_INSTR shufpd, 1, 1, 0
 AVX_INSTR shufps, 1, 1, 0
+AVX_INSTR sqrtpd, 1, 0, 0
+AVX_INSTR sqrtps, 1, 0, 0
+AVX_INSTR sqrtsd, 1, 0, 0
+AVX_INSTR sqrtss, 1, 0, 0
+AVX_INSTR stmxcsr
 AVX_INSTR subpd, 1, 0, 0
 AVX_INSTR subps, 1, 0, 0
 AVX_INSTR subsd, 1, 0, 0
 AVX_INSTR subss, 1, 0, 0
+AVX_INSTR ucomisd
+AVX_INSTR ucomiss
 AVX_INSTR unpckhpd, 1, 0, 0
 AVX_INSTR unpckhps, 1, 0, 0
 AVX_INSTR unpcklpd, 1, 0, 0



More information about the ffmpeg-cvslog mailing list