[FFmpeg-devel] [PATCH] x86/diracdsp: make ff_put_signed_rect_clamped_10_sse4 work on x86_32

Rostislav Pehlivanov atomnuker at gmail.com
Wed Jul 20 19:28:42 EEST 2016


On 20 July 2016 at 02:40, James Almer <jamrial at gmail.com> wrote:

> Signed-off-by: James Almer <jamrial at gmail.com>
> ---
>  libavcodec/x86/diracdsp.asm    | 37 ++++++++++++++++++++-----------------
>  libavcodec/x86/diracdsp_init.c |  4 ----
>  2 files changed, 20 insertions(+), 21 deletions(-)
>
> diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
> index d86b543..6b3f780 100644
> --- a/libavcodec/x86/diracdsp.asm
> +++ b/libavcodec/x86/diracdsp.asm
> @@ -303,24 +303,30 @@ cglobal dequant_subband_32, 7, 7, 4, src, dst,
> stride, qf, qs, tot_v, tot_h
>
>      RET
>
> -%if ARCH_X86_64 == 1
> +INIT_XMM sse4
>  ; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const
> uint8_t *src, int src_stride, int width, int height)
> -cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, dst_stride, src,
> src_stride, w, h
> -    mov      r6, srcq
> -    mov      r7, dstq
> -    mov      r8, wq
> +%if ARCH_X86_64
> +cglobal put_signed_rect_clamped_10, 6, 8, 5, dst, dst_stride, src,
> src_stride, w, h, t1, t2
> +%else
> +cglobal put_signed_rect_clamped_10, 5, 7, 5, dst, dst_stride, src,
> src_stride, w, t1, t2
> +    %define  hd  r5mp
> +%endif
> +    shl      wd, 2
> +    add    srcq, wq
> +    neg      wq
> +    mov     t2q, dstq
> +    mov     t1q, wq
>      pxor     m2, m2
>      mova     m3, [clip_10bit]
>      mova     m4, [convert_to_unsigned_10bit]
>
>      .loop_h:
> -    mov      srcq, r6
> -    mov      dstq, r7
> -    mov      wq,   r8
> +    mov    dstq, t2q
> +    mov      wq, t1q
>
>      .loop_w:
> -    movu     m0, [srcq+0*mmsize]
> -    movu     m1, [srcq+1*mmsize]
> +    movu     m0, [srcq+wq+0*mmsize]
> +    movu     m1, [srcq+wq+1*mmsize]
>
>      paddd    m0, m4
>      paddd    m1, m4
> @@ -329,16 +335,13 @@ cglobal put_signed_rect_clamped_10, 6, 9, 6, dst,
> dst_stride, src, src_stride, w
>
>      movu     [dstq], m0
>
> -    add      srcq, 2*mmsize
>      add      dstq, 1*mmsize
> -    sub      wd, 8
> -    jg       .loop_w
> +    add      wq,   2*mmsize
> +    jl       .loop_w
>
> -    add      r6, src_strideq
> -    add      r7, dst_strideq
> +    add    srcq, src_strideq
> +    add     t2q, dst_strideq
>      sub      hd, 1
>      jg       .loop_h
>
>      RET
> -
> -%endif
> diff --git a/libavcodec/x86/diracdsp_init.c
> b/libavcodec/x86/diracdsp_init.c
> index d7c7cd1..b195113 100644
> --- a/libavcodec/x86/diracdsp_init.c
> +++ b/libavcodec/x86/diracdsp_init.c
> @@ -45,9 +45,7 @@ void ff_put_rect_clamped_mmx(uint8_t *dst, int
> dst_stride, const int16_t *src, i
>  void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t
> *src, int src_stride, int width, int height);
>  void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const
> int16_t *src, int src_stride, int width, int height);
>  void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const
> int16_t *src, int src_stride, int width, int height);
> -#if ARCH_X86_64
>  void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride,
> const uint8_t *src, int src_stride, int width, int height);
> -#endif
>
>  void ff_dequant_subband_32_sse4(uint8_t *src, uint8_t *dst, ptrdiff_t
> stride, const int qf, const int qs, int tot_v, int tot_h);
>
> @@ -192,8 +190,6 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
>
>      if (EXTERNAL_SSE4(mm_flags)) {
>          c->dequant_subband[1]         = ff_dequant_subband_32_sse4;
> -#if ARCH_X86_64
>          c->put_signed_rect_clamped[1] =
> ff_put_signed_rect_clamped_10_sse4;
> -#endif
>      }
>  }
> --
> 2.9.1
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>

Very nice, thanks.
Push whenever you have the time.


More information about the ffmpeg-devel mailing list