[FFmpeg-cvslog] avcodec/utvideodec : add SIMD (SSSE3 and AVX2) for gradient_pred

Martin Vignali git at videolan.org
Sat Dec 9 16:20:18 EET 2017


ffmpeg | branch: master | Martin Vignali <martin.vignali at gmail.com> | Sat Dec  2 19:46:42 2017 +0100| [630967ef63d0f2a5cc12b06815af0ec6cb5c9d2a] | committer: Martin Vignali

avcodec/utvideodec : add SIMD (SSSE3 and AVX2) for gradient_pred

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=630967ef63d0f2a5cc12b06815af0ec6cb5c9d2a
---

 libavcodec/lossless_videodsp.c          | 11 +++++
 libavcodec/lossless_videodsp.h          |  1 +
 libavcodec/utvideodec.c                 |  5 ++-
 libavcodec/x86/lossless_videodsp.asm    | 80 +++++++++++++++++++++++++++++++++
 libavcodec/x86/lossless_videodsp_init.c |  5 +++
 5 files changed, 101 insertions(+), 1 deletion(-)

diff --git a/libavcodec/lossless_videodsp.c b/libavcodec/lossless_videodsp.c
index b5b96e6129..cff94c234d 100644
--- a/libavcodec/lossless_videodsp.c
+++ b/libavcodec/lossless_videodsp.c
@@ -98,6 +98,16 @@ static int add_left_pred_int16_c(uint16_t *dst, const uint16_t *src, unsigned ma
     return acc;
 }
 
+static void add_gradient_pred_c(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width){ /* in-place gradient prediction over src[0 .. width-1] */
+    int A, B, C, i; /* note: the loop also reads src[-1] and src[-stride-1 .. width-1-stride], so those must be valid */
+
+    for (i = 0; i < width; i++) {
+        A = src[i - stride];       /* sample one stride before the current one */
+        B = src[i - (stride + 1)]; /* sample one stride and one byte before */
+        C = src[i - 1];            /* previous sample; for i > 0 this was already rewritten by the previous iteration */
+        src[i] = (A - B + C + src[i]) & 0xFF; /* add the predictor A - B + C to the stored value, keeping the low 8 bits */
+    }
+}
 
 void ff_llviddsp_init(LLVidDSPContext *c)
 {
@@ -106,6 +116,7 @@ void ff_llviddsp_init(LLVidDSPContext *c)
     c->add_left_pred              = add_left_pred_c;
 
     c->add_left_pred_int16        = add_left_pred_int16_c;
+    c->add_gradient_pred          = add_gradient_pred_c;
 
     if (ARCH_PPC)
         ff_llviddsp_init_ppc(c);
diff --git a/libavcodec/lossless_videodsp.h b/libavcodec/lossless_videodsp.h
index ccab39bac6..8077898d1a 100644
--- a/libavcodec/lossless_videodsp.h
+++ b/libavcodec/lossless_videodsp.h
@@ -39,6 +39,7 @@ typedef struct LLVidDSPContext {
 
     int  (*add_left_pred_int16)(uint16_t *dst, const uint16_t *src,
                                 unsigned mask, ptrdiff_t w, unsigned left);
+    void (*add_gradient_pred)(uint8_t *src /* align 32 */, const ptrdiff_t stride, const ptrdiff_t width);
 } LLVidDSPContext;
 
 void ff_llviddsp_init(LLVidDSPContext *llviddsp);
diff --git a/libavcodec/utvideodec.c b/libavcodec/utvideodec.c
index d2da825fbf..b85cb5daa6 100644
--- a/libavcodec/utvideodec.c
+++ b/libavcodec/utvideodec.c
@@ -460,6 +460,7 @@ static void restore_gradient_planar(UtvideoContext *c, uint8_t *src, ptrdiff_t s
     uint8_t *bsrc;
     int slice_start, slice_height;
     const int cmask = ~rmode;
+    int min_width = FFMIN(width, 32);
 
     for (slice = 0; slice < slices; slice++) {
         slice_start  = ((slice * height) / slices) & cmask;
@@ -479,12 +480,14 @@ static void restore_gradient_planar(UtvideoContext *c, uint8_t *src, ptrdiff_t s
         for (j = 1; j < slice_height; j++) {
             // second line - first element has top prediction, the rest uses gradient
             bsrc[0] = (bsrc[0] + bsrc[-stride]) & 0xFF;
-            for (i = 1; i < width; i++) {
+            for (i = 1; i < min_width; i++) { /* dsp need align 32 */
                 A = bsrc[i - stride];
                 B = bsrc[i - (stride + 1)];
                 C = bsrc[i - 1];
                 bsrc[i] = (A - B + C + bsrc[i]) & 0xFF;
             }
+            if (width > 32)
+                c->llviddsp.add_gradient_pred(bsrc + 32, stride, width - 32);
             bsrc += stride;
         }
     }
diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm
index cfa0620fd1..9a169fe314 100644
--- a/libavcodec/x86/lossless_videodsp.asm
+++ b/libavcodec/x86/lossless_videodsp.asm
@@ -2,6 +2,7 @@
 ;* SIMD lossless video DSP utils
 ;* Copyright (c) 2008 Loren Merritt
 ;* Copyright (c) 2014 Michael Niedermayer
+;* Copyright (c) 2017 Jokyo Images
 ;*
 ;* This file is part of FFmpeg.
 ;*
@@ -325,3 +326,82 @@ cglobal add_left_pred_int16, 4,4,8, dst, src, mask, w, left
     ADD_HFYU_LEFT_LOOP_INT16 u, a
 .src_unaligned:
     ADD_HFYU_LEFT_LOOP_INT16 u, u
+
+
+;---------------------------------------------------------------------------------------------
+; void add_gradient_pred(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width)
+;---------------------------------------------------------------------------------------------
+%macro ADD_GRADIENT_PRED 0
+cglobal add_gradient_pred, 3,4,5, src, stride, width, tmp
+    mova         xm0, [pb_15] ; pshufb mask used below to replicate byte 15 (pb_15 is defined outside this hunk; presumably 16 bytes of value 15)
+
+; C = src[-1]: load the byte just before the first output and broadcast it to every byte of xm1
+    movd         xm1, [srcq-1]
+%if cpuflag(avx2)
+    vpbroadcastb xm1, xm1
+%else
+    pxor         xm2, xm2 ; all-zero shuffle mask: every output byte selects byte 0
+    pshufb       xm1, xm2
+%endif
+
+    add    srcq, widthq ; srcq = one past the last byte to process
+    neg  widthq ; widthq = -width, counts up towards 0
+    neg strideq ; srcq + strideq now addresses the row one stride earlier
+
+.loop:
+    lea    tmpq, [srcq + strideq] ; tmpq = same position in the previous row
+    mova     m2, [tmpq + widthq] ; A = src[x-stride]
+    movu     m3, [tmpq + widthq - 1] ; B = src[x - (stride + 1)]
+    mova     m4, [srcq + widthq] ; current val (src[x])
+
+    psubb    m2, m3; A - B
+
+; prefix sum of A-B: log-step byte shifts by 1, 2, 4, 8 (pslldq shifts within each 128-bit lane)
+    pslldq   m3, m2, 1
+    paddb    m2, m3
+    pslldq   m3, m2, 2
+    paddb    m2, m3
+    pslldq   m3, m2, 4
+    paddb    m2, m3
+    pslldq   m3, m2, 8
+    paddb    m2, m3
+
+; prefix sum of the current values, same log-step scheme
+    pslldq   m3, m4, 1
+    paddb    m4, m3
+    pslldq   m3, m4, 2
+    paddb    m4, m3
+    pslldq   m3, m4, 4
+    paddb    m4, m3
+    pslldq   m3, m4, 8
+    paddb    m4, m3
+
+; last sum
+    paddb                    m2, m4 ; current + (A - B)
+
+    paddb                   xm1, xm2 ; += C
+    mova        [srcq + widthq], xm1 ; store
+
+    pshufb                  xm1, xm0 ; put last val in all val of xm1 (becomes C for the next 16 bytes)
+
+%if mmsize == 32
+    vextracti128            xm2, m2, 1 ; get second lane of the ymm
+    paddb                   xm1, xm2; += C
+
+    mova   [srcq + widthq + 16], xm1 ; store
+    pshufb                  xm1, xm0 ; put last val in all val of xm1
+%endif
+
+    add         widthq, mmsize ; advance by one full vector
+    jl .loop ; repeat while widthq < 0; the last iteration still loads/stores mmsize bytes even if fewer remain
+    RET
+
+%endmacro
+
+INIT_XMM ssse3
+ADD_GRADIENT_PRED
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+ADD_GRADIENT_PRED
+%endif
diff --git a/libavcodec/x86/lossless_videodsp_init.c b/libavcodec/x86/lossless_videodsp_init.c
index beae317cc2..e3063de462 100644
--- a/libavcodec/x86/lossless_videodsp_init.c
+++ b/libavcodec/x86/lossless_videodsp_init.c
@@ -44,6 +44,9 @@ int  ff_add_left_pred_unaligned_avx2(uint8_t *dst, const uint8_t *src,
 int ff_add_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
 int ff_add_left_pred_int16_sse4(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
 
+void ff_add_gradient_pred_ssse3(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width);
+void ff_add_gradient_pred_avx2(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width);
+
 #if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
 static void add_median_pred_cmov(uint8_t *dst, const uint8_t *top,
                                  const uint8_t *diff, ptrdiff_t w,
@@ -109,6 +112,7 @@ void ff_llviddsp_init_x86(LLVidDSPContext *c)
     if (EXTERNAL_SSSE3(cpu_flags)) {
         c->add_left_pred = ff_add_left_pred_ssse3;
         c->add_left_pred_int16 = ff_add_left_pred_int16_ssse3;
+        c->add_gradient_pred   = ff_add_gradient_pred_ssse3;
     }
 
     if (EXTERNAL_SSSE3_FAST(cpu_flags)) {
@@ -121,5 +125,6 @@ void ff_llviddsp_init_x86(LLVidDSPContext *c)
     if (EXTERNAL_AVX2_FAST(cpu_flags)) {
         c->add_bytes       = ff_add_bytes_avx2;
         c->add_left_pred   = ff_add_left_pred_unaligned_avx2;
+        c->add_gradient_pred = ff_add_gradient_pred_avx2;
     }
 }



More information about the ffmpeg-cvslog mailing list