[FFmpeg-devel] [PATCH 01/10] diracdsp: add SIMD for the 10 bit version of put_signed_rect_clamped

Rostislav Pehlivanov rpehlivanov at ob-encoder.com
Thu Jun 23 19:06:55 CEST 2016


Signed-off-by: Rostislav Pehlivanov <rpehlivanov at obe.tv>
---
 libavcodec/x86/diracdsp.asm    | 47 ++++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/diracdsp_init.c |  6 ++++++
 2 files changed, 53 insertions(+)

diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
index a042413..9db7b67 100644
--- a/libavcodec/x86/diracdsp.asm
+++ b/libavcodec/x86/diracdsp.asm
@@ -22,6 +22,8 @@
 
 SECTION_RODATA
 pw_7: times 8 dw 7
+convert_to_unsigned_10bit: times 4 dd 0x200 ; dword bias added with paddd in put_signed_rect_clamped_10
+clip_10bit:                times 8 dw 0x3ff ; word upper clamp bound in put_signed_rect_clamped_10
 
 cextern pw_3
 cextern pw_16
@@ -172,6 +174,48 @@ cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w,
     RET
 %endm
 
+%macro PUT_RECT_10 0
+; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height)
+; Each row: dst[x] (uint16) = clamp(src[x] (int32) + 0x200, 0, 0x3ff). Strides are
+; byte offsets added to the pointers. Pixels are done 8 per iteration, so each row
+; is read and written up to the next multiple of 8 at or past width.
+cglobal put_signed_rect_clamped_10, 6, 7, 4, dst, dst_stride, src, src_stride, w, h, pos
+    ; int args may carry garbage in their upper 32 bits on x86-64
+    movsxdifnidn dst_strideq, dst_strided
+    movsxdifnidn src_strideq, src_strided
+    movsxdifnidn wq, wd
+    lea      srcq, [srcq+wq*4]  ; point past the row; pos runs from -width up to 0
+    lea      dstq, [dstq+wq*2]
+    neg      wq
+    mova     m2, [clip_10bit]
+    mova     m3, [convert_to_unsigned_10bit]
+
+.loop_h:
+    mov      posq, wq
+
+.loop_w:
+    movu     m0, [srcq+posq*4+0*mmsize]
+    movu     m1, [srcq+posq*4+1*mmsize]
+
+    paddd    m0, m3
+    paddd    m1, m3
+    packusdw m0, m1             ; saturates to 0..0xffff (unsigned)
+    ; unsigned min: CLIPW's signed pmaxsw/pminsw would turn words >= 0x8000 into 0
+    pminuw   m0, m2
+
+    movu     [dstq+posq*2], m0
+
+    add      posq, 8
+    jl       .loop_w
+
+    add      srcq, src_strideq
+    add      dstq, dst_strideq
+    sub      hd, 1
+    jg       .loop_h
+
+    RET
+%endm
+
 %macro ADD_RECT 1
 ; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
 cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
@@ -263,3 +307,6 @@ ADD_RECT sse2
 HPEL_FILTER sse2
 ADD_OBMC 32, sse2
 ADD_OBMC 16, sse2
+
+INIT_XMM sse4 ; PUT_RECT_10 uses packusdw, an SSE4.1 instruction
+PUT_RECT_10
diff --git a/libavcodec/x86/diracdsp_init.c b/libavcodec/x86/diracdsp_init.c
index 5fae798..4786eea 100644
--- a/libavcodec/x86/diracdsp_init.c
+++ b/libavcodec/x86/diracdsp_init.c
@@ -46,6 +46,8 @@ void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src,
 void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
 void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
 
+void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height);
+
 #if HAVE_YASM
 
 #define HPEL_FILTER(MMSIZE, EXT)                                                             \
@@ -184,4 +186,8 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
         c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2;
         c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2;
     }
+
+    if (EXTERNAL_SSE4(mm_flags)) {
+        c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4;
+    }
 }
-- 
2.8.1.369.geae769a



More information about the ffmpeg-devel mailing list