[FFmpeg-devel] [PATCH 01/10] diracdsp: add SIMD for the 10 bit version of put_signed_rect_clamped

Fri Jun 24 15:43:00 CEST 2016

On Fri, Jun 24, 2016 at 12:44:21PM +0100, Rostislav Pehlivanov wrote:
> On 23 June 2016 at 20:57, James Almer <jamrial at gmail.com> wrote:
> 
> > On 6/23/2016 2:06 PM, Rostislav Pehlivanov wrote:
> > > Signed-off-by: Rostislav Pehlivanov <rpehlivanov at obe.tv>
> > > ---
> > >  libavcodec/x86/diracdsp.asm    | 47
> > ++++++++++++++++++++++++++++++++++++++++++
> > >  libavcodec/x86/diracdsp_init.c |  6 ++++++
> > >  2 files changed, 53 insertions(+)
> > >
> > > diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
> > > index a042413..9db7b67 100644
> > > --- a/libavcodec/x86/diracdsp.asm
> > > +++ b/libavcodec/x86/diracdsp.asm
> > > @@ -22,6 +22,8 @@
> > >
> > >  SECTION_RODATA
> > >  pw_7: times 8 dw 7
> > > +convert_to_unsigned_10bit: times 4 dd 0x200
> > > +clip_10bit:                times 8 dw 0x3ff
> > >
> > >  cextern pw_3
> > >  cextern pw_16
> > > @@ -172,6 +174,48 @@ cglobal put_signed_rect_clamped_%1, 5,9,3, dst,
> > dst_stride, src, src_stride, w,
> > >      RET
> > >  %endm
> > >
> > > +%macro PUT_RECT_10 0
> > > +; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const
> > uint8_t *src, int src_stride, int width, int height)
> > > +cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, dst_stride, src,
> > src_stride, w, h
> >
> > This is x86_64 only. Either add the relevant pre-processor checks here
> > and to the init file, or make the necessary changes to make it work
> > on x86_32.
> > Look at the 8bit version of put_signed_rect_clamped for an example of
> > how to deal with this using stack.
> >
> > > +
> > > +    neg      wq
> > > +    neg      hq
> >
> > Why? You're not using these as part of effective addresses, just as
> > counters. Keep them as is and just do sub instead of add in the loops
> > below.
> > For that matter, you'd need to sign extend these with movsxd before
> > negating them, or change the prototype and make them ptrdiff_t instead
> > of int.
> >
> > > +    mov      r6, srcq
> > > +    mov      r7, dstq
> > > +    mov      r8, wq
> > > +    pxor     m2, m2
> > > +    mova     m3, [clip_10bit]
> > > +    mova     m4, [convert_to_unsigned_10bit]
> > > +
> > > +    .loop_h:
> > > +    mov      srcq, r6
> > > +    mov      dstq, r7
> > > +    mov      wq,   r8
> > > +
> > > +    .loop_w:
> > > +    movu     m0, [srcq+0*mmsize]
> > > +    movu     m1, [srcq+1*mmsize]
> > > +
> > > +    paddd    m0, m4
> > > +    paddd    m1, m4
> > > +    packusdw m0, m0, m1
> > > +    CLIPW    m0, m2, m3 ; packusdw saturates so it's fine
> >
> > Would be nice if you could make this work with SSE2 as well.
> > There are some examples of packusdw SSE2 emulation in the codebase.
> >
> > > +
> > > +    movu     [dstq], m0
> > > +
> > > +    add      srcq, 2*mmsize
> > > +    add      dstq, 1*mmsize
> > > +    add      wq, 8
> > > +    jl       .loop_w
> > > +
> > > +    add      r6, src_strideq
> > > +    add      r7, dst_strideq
> > > +    add      hq, 1
> >
> > Make sure to do "sub wd, 8" and "sub hd, 1" after removing the above
> > negs if don't change the prototype.
> >
> > > +    jl       .loop_h
> > > +
> > > +    RET
> > > +%endm
> > > +
> > >  %macro ADD_RECT 1
> > >  ; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride,
> > int16_t *idwt, int idwt_stride, int width, int height)
> > >  cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt,
> > idwt_stride, w, h
> > > @@ -263,3 +307,6 @@ ADD_RECT sse2
> > >  HPEL_FILTER sse2
> > >  ADD_OBMC 32, sse2
> > >  ADD_OBMC 16, sse2
> > > +
> > > +INIT_XMM sse4
> > > +PUT_RECT_10
> >
> > No need to make it a macro if it's going to be a single version.
> > If you add a SSE2 one then this would makes sense.
> >
> > > diff --git a/libavcodec/x86/diracdsp_init.c
> > b/libavcodec/x86/diracdsp_init.c
> > > index 5fae798..4786eea 100644
> > > --- a/libavcodec/x86/diracdsp_init.c
> > > +++ b/libavcodec/x86/diracdsp_init.c
> > > @@ -46,6 +46,8 @@ void ff_put_rect_clamped_sse2(uint8_t *dst, int
> > dst_stride, const int16_t *src,
> > >  void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const
> > int16_t *src, int src_stride, int width, int height);
> > >  void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride,
> > const int16_t *src, int src_stride, int width, int height);
> > >
> > > +void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride,
> > const uint8_t *src, int src_stride, int width, int height);
> > > +
> > >  #if HAVE_YASM
> > >
> > >  #define HPEL_FILTER(MMSIZE, EXT)
> >                      \
> > > @@ -184,4 +186,8 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
> > >          c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2;
> > >          c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2;
> > >      }
> > > +
> > > +    if (EXTERNAL_SSE4(mm_flags)) {
> > > +        c->put_signed_rect_clamped[1] =
> > ff_put_signed_rect_clamped_10_sse4;
> > > +    }
> > >  }
> > >
> >
> > _______________________________________________
> > ffmpeg-devel mailing list
> > ffmpeg-devel at ffmpeg.org
> > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
> 
> Like with the dequant asm, I've attached an amended patch
> 
> Thanks

>  diracdsp.asm    |   45 +++++++++++++++++++++++++++++++++++++++++++++
>  diracdsp_init.c |   10 ++++++++++
>  2 files changed, 55 insertions(+)
> a21d5ad533c3fcaee015ab08ab01af8024be81ae  0001-diracdsp-add-SIMD-for-the-10-bit-version-of-put_sign.patch
> From 86ecebfe70509329d6f5b8a587ae79d19f9c8154 Mon Sep 17 00:00:00 2001
> From: Rostislav Pehlivanov <rpehlivanov at ob-encoder.com>
> Date: Thu, 23 Jun 2016 18:06:55 +0100
> Subject: [PATCH 1/2] diracdsp: add SIMD for the 10 bit version of
>  put_signed_rect_clamped

fails fate (file looks very green when viewed)

make -j12 fate-vsynth1-vc2-420p10
TEST    vsynth1-vc2-420p10

--- ./tests/ref/vsynth/vsynth1-vc2-420p10       2016-06-23 23:46:59.519404425 +0200
+++ tests/data/fate/vsynth1-vc2-420p10  2016-06-24 15:39:08.840607983 +0200
@@ -1,4 +1,4 @@
 1365742985b6315f6796c765aa17f39e *tests/data/fate/vsynth1-vc2-420p10.mov
 1417047 tests/data/fate/vsynth1-vc2-420p10.mov
-d3deedfa461a2696f82910890412fa2d *tests/data/fate/vsynth1-vc2-420p10.out.rawvideo
-stddev:    0.60 PSNR: 52.47 MAXDIFF:    1 bytes:  7603200/   760320
+fc2ec1af7f07e8c10701960a12a67530 *tests/data/fate/vsynth1-vc2-420p10.out.rawvideo
+stddev:  133.96 PSNR:  5.59 MAXDIFF:  254 bytes:  7603200/   760320
Test vsynth1-vc2-420p10 failed. Look at tests/data/fate/vsynth1-vc2-420p10.err for details.
make: *** [fate-vsynth1-vc2-420p10] Error 1

[...]

-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

There will always be a question for which you do not know the correct answer.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 181 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20160624/4cb4db2a/attachment.sig>