[FFmpeg-cvslog] x86/tx_float: add asm call versions of the 2pt and 4pt transforms

Mon Sep 19 07:01:22 EEST 2022

ffmpeg | branch: master | Lynne <dev at lynne.ee> | Mon Sep 19 04:14:52 2022 +0200| [4ba68639cabfc56ffe62d4e776a8af040e551ff3] | committer: Lynne

x86/tx_float: add asm call versions of the 2pt and 4pt transforms

Verified to be working.

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=4ba68639cabfc56ffe62d4e776a8af040e551ff3
---

 libavutil/x86/tx_float.asm    | 26 +++++++++++++++++++++++---
 libavutil/x86/tx_float_init.c |  9 +++++++++
 2 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/libavutil/x86/tx_float.asm b/libavutil/x86/tx_float.asm
index b644db49be..b3a85a7cb9 100644
--- a/libavutil/x86/tx_float.asm
+++ b/libavutil/x86/tx_float.asm
@@ -682,15 +682,27 @@ SECTION .text
 %endmacro
 
 INIT_XMM sse3
+cglobal fft2_asm_float, 0, 0, 0, ctx, out, in, stride ; asm-call variant of fft2_float; NOTE(review): the 0, 0, 0 counts presumably make x86inc load no args and reserve no registers -- confirm against x86inc.asm
+    movaps m0, [inq]            ; aligned load of mmsize bytes from the input pointer
+    FFT2 m0, m1                 ; result is left in m0; m1 is presumably scratch for FFT2 (macro defined outside this hunk)
+    movaps [outq], m0           ; aligned store of the result to the output pointer
+    add inq, mmsize*1           ; unlike fft2_float, leave inq advanced past the vector just read
+    add outq, mmsize*1          ; likewise leave outq advanced past the vector just written
+    ret                         ; lowercase ret here, whereas fft2_float ends with the RET macro
+
 cglobal fft2_float, 4, 4, 2, ctx, out, in, stride
     movaps m0, [inq]
     FFT2 m0, m1
     movaps [outq], m0
     RET
 
-%macro FFT4 2
+%macro FFT4_FN 3
 INIT_XMM sse2
+%if %3
+cglobal fft4_ %+ %1 %+ _asm_float, 0, 0, 0, ctx, out, in, stride
+%else
 cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride
+%endif
     movaps m0, [inq + 0*mmsize]
     movaps m1, [inq + 1*mmsize]
 
@@ -708,11 +720,19 @@ cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride
     movaps [outq + 0*mmsize], m2
     movaps [outq + 1*mmsize], m0
 
+%if %3
+    add inq, mmsize*2
+    add outq, mmsize*2
+    ret
+%else
     RET
+%endif
 %endmacro
 
-FFT4 fwd, 0
-FFT4 inv, 1
+FFT4_FN fwd, 0, 0
+FFT4_FN fwd, 0, 1
+FFT4_FN inv, 1, 0
+FFT4_FN inv, 1, 1
 
 %macro FFT8_SSE_FN 1
 INIT_XMM sse3
diff --git a/libavutil/x86/tx_float_init.c b/libavutil/x86/tx_float_init.c
index 25de7b3ec6..06df749fa9 100644
--- a/libavutil/x86/tx_float_init.c
+++ b/libavutil/x86/tx_float_init.c
@@ -45,6 +45,9 @@ TX_DECL_FN(fft_sr_ns, avx2)
 
 TX_DECL_FN(mdct_sr_inv, avx2)
 
+TX_DECL_FN(fft2_asm, sse3)
+TX_DECL_FN(fft4_fwd_asm, sse2)
+TX_DECL_FN(fft4_inv_asm, sse2)
 TX_DECL_FN(fft8_asm, sse3)
 TX_DECL_FN(fft8_asm, avx)
 TX_DECL_FN(fft16_asm, avx)
@@ -101,8 +104,14 @@ static av_cold int m_inv_init(AVTXContext *s, const FFTXCodelet *cd,
 
 const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = {
     TX_DEF(fft2,     FFT,  2,  2, 2, 0, 128, NULL,  sse3, SSE3, AV_TX_INPLACE, 0),
+    TX_DEF(fft2_asm, FFT,  2,  2, 2, 0, 192, b8_i0, sse3, SSE3,
+           AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, 0),
     TX_DEF(fft2,     FFT,  2,  2, 2, 0, 192, b8_i0, sse3, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
     TX_DEF(fft4_fwd, FFT,  4,  4, 2, 0, 128, NULL,  sse2, SSE2, AV_TX_INPLACE | FF_TX_FORWARD_ONLY, 0),
+    TX_DEF(fft4_fwd_asm, FFT,  4,  4, 2, 0, 192, b8_i0, sse2, SSE2,
+           AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, 0),
+    TX_DEF(fft4_inv_asm, FFT,  4,  4, 2, 0, 128, NULL,  sse2, SSE2,
+           AV_TX_INPLACE | FF_TX_INVERSE_ONLY | FF_TX_ASM_CALL, 0),
     TX_DEF(fft4_fwd, FFT,  4,  4, 2, 0, 192, b8_i0, sse2, SSE2, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
     TX_DEF(fft4_inv, FFT,  4,  4, 2, 0, 128, NULL,  sse2, SSE2, AV_TX_INPLACE | FF_TX_INVERSE_ONLY, 0),
     TX_DEF(fft8,     FFT,  8,  8, 2, 0, 128, b8_i0, sse3, SSE3, AV_TX_INPLACE, 0),