[FFmpeg-devel] [PATCH 2/6] diracdec: add 10-bit Legall 5, 3 (5_3) SIMD functions

James Darnley jdarnley at obe.tv
Thu Jul 19 17:52:48 EEST 2018


Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the
relevant transform.
C:     94fps
SSE2: 118fps
AVX2: 121fps
---
 libavcodec/x86/dirac_dwt_10bit.asm    | 55 +++++++++++++++++++++++++++
 libavcodec/x86/dirac_dwt_init_10bit.c | 23 +++++++++++
 2 files changed, 78 insertions(+)

diff --git a/libavcodec/x86/dirac_dwt_10bit.asm b/libavcodec/x86/dirac_dwt_10bit.asm
index dc3830615e..c00de32bfe 100644
--- a/libavcodec/x86/dirac_dwt_10bit.asm
+++ b/libavcodec/x86/dirac_dwt_10bit.asm
@@ -24,6 +24,7 @@
 SECTION_RODATA
 
 cextern pd_1
+pd_2: times 4 dd 2 ; four packed dwords of 2: rounding bias added before the >> 2 in legall53_vertical_lo
 
 SECTION .text
 
@@ -100,9 +101,63 @@ REP_RET
 
 %endmacro
 
+%macro LEGALL53_VERTICAL_LO 0
+; legall53_vertical_lo: b1[i] -= (b0[i] + b2[i] + 2) >> 2 on 32-bit data, one full vector (mmsize bytes) per iteration
+cglobal legall53_vertical_lo, 4, 4, 4, b0, b1, b2, w
+    mova m3, [pd_2]             ; rounding bias: packed dwords of 2
+    shl wd, 2                   ; element count -> byte count (4 bytes per dword)
+    add b0q, wq                 ; move all three row pointers to the end of their rows ...
+    add b1q, wq
+    add b2q, wq
+    neg wq                      ; ... and index them with a negative byte offset that counts up to 0
+
+    ALIGN 16
+    .loop:
+        mova m0, [b0q + wq]     ; NOTE(review): mova is x86inc's aligned move; assumes rows are mmsize-aligned - confirm against the caller
+        mova m1, [b1q + wq]
+        mova m2, [b2q + wq]
+        paddd m0, m2            ; b0 + b2
+        paddd m0, m3            ; + 2
+        psrad m0, 2             ; arithmetic shift: (b0 + b2 + 2) >> 2
+        psubd m1, m0            ; b1 - ((b0 + b2 + 2) >> 2)
+        mova [b1q + wq], m1     ; result is written back in place to b1
+        add wq, mmsize          ; next vector; sets the flags tested by jl
+    jl .loop                    ; condition is tested after the body, so a count of 0 still processes one vector
+RET
+
+%endmacro
+
+%macro LEGALL53_VERTICAL_HI 0
+; legall53_vertical_hi: b1[i] += (b0[i] + b2[i] + 1) >> 1 on 32-bit data, one full vector (mmsize bytes) per iteration
+cglobal legall53_vertical_hi, 4, 4, 4, b0, b1, b2, w
+    mova m3, [pd_1]             ; rounding bias; pd_1 is declared cextern earlier in the patched file, presumably packed dwords of 1 - confirm
+    shl wd, 2                   ; element count -> byte count (4 bytes per dword)
+    add b0q, wq                 ; move all three row pointers to the end of their rows ...
+    add b1q, wq
+    add b2q, wq
+    neg wq                      ; ... and index them with a negative byte offset that counts up to 0
+
+    ALIGN 16
+    .loop:
+        mova m0, [b0q + wq]     ; NOTE(review): mova is x86inc's aligned move; assumes rows are mmsize-aligned - confirm against the caller
+        mova m1, [b1q + wq]
+        mova m2, [b2q + wq]
+        paddd m0, m2            ; b0 + b2
+        paddd m0, m3            ; + rounding bias
+        psrad m0, 1             ; arithmetic shift: (b0 + b2 + bias) >> 1
+        paddd m1, m0            ; b1 + ((b0 + b2 + bias) >> 1)
+        mova [b1q + wq], m1     ; result is written back in place to b1
+        add wq, mmsize          ; next vector; sets the flags tested by jl
+    jl .loop                    ; condition is tested after the body, so a count of 0 still processes one vector
+RET
+
+%endmacro
+
 INIT_XMM sse2
 HAAR_HORIZONTAL
 HAAR_VERTICAL
+LEGALL53_VERTICAL_HI
+LEGALL53_VERTICAL_LO
 
 INIT_XMM avx
 HAAR_HORIZONTAL
diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c b/libavcodec/x86/dirac_dwt_init_10bit.c
index 939950e3ff..88cf267d14 100644
--- a/libavcodec/x86/dirac_dwt_init_10bit.c
+++ b/libavcodec/x86/dirac_dwt_init_10bit.c
@@ -23,6 +23,9 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/dirac_dwt.h"
 
+void ff_legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width); /* asm: b1[i] += (b0[i] + b2[i] + 1) >> 1 in place; width must be a nonzero multiple of 4 */
+void ff_legall53_vertical_lo_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width); /* asm: b1[i] -= (b0[i] + b2[i] + 2) >> 2 in place; width must be a nonzero multiple of 4 */
+
 void ff_horizontal_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int width_align);
 void ff_horizontal_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int width_align);
 void ff_horizontal_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int width_align);
@@ -91,6 +94,22 @@ static void horizontal_compose_haar_avx2(int32_t *b, int32_t *tmp, int width)
     }
 }
 
+static void legall53_vertical_lo_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width)
+{ /* SSE2 front end: the asm updates b1 for the first (width & ~3) coefficients, the C loop for the remaining 0-3 */
+    int i = width & ~3; /* largest multiple of 4 (int32 lanes per XMM register) not above width */
+    if (i) ff_legall53_vertical_lo_sse2(b0, b1, b2, i); /* the asm loop body always runs once: calling it with 0 would modify b1[0..3] and the tail would then compose them twice */
+    for(; i<width; i++) /* scalar tail */
+        b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]);
+}
+
+static void legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width)
+{ /* SSE2 front end: the asm updates b1 for the first (width & ~3) coefficients, the C loop for the remaining 0-3 */
+    int i = width & ~3; /* largest multiple of 4 (int32 lanes per XMM register) not above width */
+    if (i) ff_legall53_vertical_hi_sse2(b0, b1, b2, i); /* the asm loop body always runs once: calling it with 0 would modify b1[0..3] and the tail would then compose them twice */
+    for(; i<width; i++) /* scalar tail */
+        b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]);
+}
+
 av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type)
 {
 #if HAVE_X86ASM
@@ -98,6 +117,10 @@ av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type)
 
     if (EXTERNAL_SSE2(cpu_flags)) {
         switch (type) {
+            case DWT_DIRAC_LEGALL5_3:
+                d->vertical_compose_h0 = (void*)legall53_vertical_hi_sse2;
+                d->vertical_compose_l0 = (void*)legall53_vertical_lo_sse2;
+                break;
             case DWT_DIRAC_HAAR0:
                 d->vertical_compose = (void*)vertical_compose_haar_sse2;
                 break;
-- 
2.17.1



More information about the ffmpeg-devel mailing list