[FFmpeg-devel] [PATCH] x86/diracdsp: make ff_put_signed_rect_clamped_10_sse4 work on x86_32

James Almer jamrial at gmail.com
Wed Jul 20 04:40:24 EEST 2016


Signed-off-by: James Almer <jamrial at gmail.com>
---
 libavcodec/x86/diracdsp.asm    | 37 ++++++++++++++++++++-----------------
 libavcodec/x86/diracdsp_init.c |  4 ----
 2 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
index d86b543..6b3f780 100644
--- a/libavcodec/x86/diracdsp.asm
+++ b/libavcodec/x86/diracdsp.asm
@@ -303,24 +303,30 @@ cglobal dequant_subband_32, 7, 7, 4, src, dst, stride, qf, qs, tot_v, tot_h
 
     RET
 
-%if ARCH_X86_64 == 1
+INIT_XMM sse4
 ; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height)
-cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, dst_stride, src, src_stride, w, h
-    mov      r6, srcq
-    mov      r7, dstq
-    mov      r8, wq
+%if ARCH_X86_64
+cglobal put_signed_rect_clamped_10, 6, 8, 5, dst, dst_stride, src, src_stride, w, h, t1, t2
+%else
+cglobal put_signed_rect_clamped_10, 5, 7, 5, dst, dst_stride, src, src_stride, w, t1, t2
+    %define  hd  r5mp
+%endif
+    shl      wd, 2
+    add    srcq, wq
+    neg      wq
+    mov     t2q, dstq
+    mov     t1q, wq
     pxor     m2, m2
     mova     m3, [clip_10bit]
     mova     m4, [convert_to_unsigned_10bit]
 
     .loop_h:
-    mov      srcq, r6
-    mov      dstq, r7
-    mov      wq,   r8
+    mov    dstq, t2q
+    mov      wq, t1q
 
     .loop_w:
-    movu     m0, [srcq+0*mmsize]
-    movu     m1, [srcq+1*mmsize]
+    movu     m0, [srcq+wq+0*mmsize]
+    movu     m1, [srcq+wq+1*mmsize]
 
     paddd    m0, m4
     paddd    m1, m4
@@ -329,16 +335,13 @@ cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, dst_stride, src, src_stride, w
 
     movu     [dstq], m0
 
-    add      srcq, 2*mmsize
     add      dstq, 1*mmsize
-    sub      wd, 8
-    jg       .loop_w
+    add      wq,   2*mmsize
+    jl       .loop_w
 
-    add      r6, src_strideq
-    add      r7, dst_strideq
+    add    srcq, src_strideq
+    add     t2q, dst_strideq
     sub      hd, 1
     jg       .loop_h
 
     RET
-
-%endif
diff --git a/libavcodec/x86/diracdsp_init.c b/libavcodec/x86/diracdsp_init.c
index d7c7cd1..b195113 100644
--- a/libavcodec/x86/diracdsp_init.c
+++ b/libavcodec/x86/diracdsp_init.c
@@ -45,9 +45,7 @@ void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, i
 void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
 void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
 void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
-#if ARCH_X86_64
 void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height);
-#endif
 
 void ff_dequant_subband_32_sse4(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h);
 
@@ -192,8 +190,6 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
 
     if (EXTERNAL_SSE4(mm_flags)) {
         c->dequant_subband[1]         = ff_dequant_subband_32_sse4;
-#if ARCH_X86_64
         c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4;
-#endif
     }
 }
-- 
2.9.1



More information about the ffmpeg-devel mailing list