[FFmpeg-devel] [PATCH] vp9: implement top/left half (4x4) sub-8x8-IDCT.

Mon Dec 2 07:46:21 CET 2013

On Sun, Dec 01, 2013 at 09:28:58PM -0500, Ronald S. Bultje wrote:
> For that specific case (eob>3&&eob<=12), runtime of idct8x8 goes from
> 668 to 477 cycles. For all idct8x8, runtime goes from 521 to 490 cycles.
> ---
>  libavcodec/x86/vp9itxfm.asm | 43 +++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 41 insertions(+), 2 deletions(-)
> 
> diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm
> index c382bdb..0c0ee91 100644
> --- a/libavcodec/x86/vp9itxfm.asm
> +++ b/libavcodec/x86/vp9itxfm.asm
> @@ -39,7 +39,7 @@ VP9_IDCT_COEFFS %1, %2
>  
>  VP9_IDCT_COEFFS_ALL 15137,  6270
>  VP9_IDCT_COEFFS_ALL 16069,  3196
> -VP9_IDCT_COEFFS      9102, 13623
> +VP9_IDCT_COEFFS_ALL  9102, 13623
>  
>  pd_8192: times 4 dd 8192
>  pw_2048: times 8 dw 2048
> @@ -205,6 +205,26 @@ cglobal vp9_idct_idct_4x4_add, 4,4,0, dst, stride, block, eob
>      VP9_IDCT8_1D_FINALIZE
>  %endmacro
>  
> +%macro VP9_IDCT8_1D_4 0

Please name this VP9_IDCT8_4x4_1D, for consistency with VP9_IDCT8_2x2_1D.

> +    pmulhrsw            m0, m12                             ; m0=t1a/t0a
> +    pmulhrsw           m10, m2, [pw_15137x2]                ; m10=t3a
> +    pmulhrsw            m2, [pw_6270x2]                     ; m2=t2a
> +    pmulhrsw           m11, m1, [pw_16069x2]                ; m11=t7a
> +    pmulhrsw            m1, [pw_3196x2]                     ; m1=t4a
> +    pmulhrsw            m9, m3, [pw_9102x2]                 ; m9=-t5a
> +    pmulhrsw            m3, [pw_13623x2]                    ; m3=t6a

You could probably load about three of the pw_* constants into registers
once at the start of .idcthalf; we already do that for pw_11585x2 (m12),
and also for pw_3196x2 (m6) and pw_16069x2 (m7) in .idcttopleftcorner.

That might require bumping the xmm register count from 13 to 16, though,
and I'm not sure whether the net effect on performance would be positive.

> +    mova                m8, m0
> +    SUMSUB_BA            w, 10,  8, 4                       ; m10=t0a+t3a (t0),  m8=t0a-t3a (t3)
> +    SUMSUB_BA            w,  2,  0, 4                       ;  m2=t1a+t2a (t1),  m0=t1a-t2a (t2)
> +    SUMSUB_BA            w,  9,  1, 4                       ;  m1=t4a+t5a (t4),  m9=t4a-t5a (t5a)
> +    SWAP                 1,  9
> +    SUMSUB_BA            w,  3, 11, 4                       ;  m3=t7a+t6a (t7), m11=t7a-t6a (t6a)
> +    SUMSUB_BA            w,  1, 11, 4                       ;  m1=t6a+t5a (t6), m11=t6a-t5a (t5)
> +    pmulhrsw            m1, m12                             ; m1=t6
> +    pmulhrsw           m11, m12                             ; m11=t5
> +    VP9_IDCT8_1D_FINALIZE
> +%endmacro
> +
>  ; TODO: a lot of t* copies can probably be removed and merged with
>  ; following SUMSUBs from VP9_IDCT8_1D_FINALIZE with AVX
>  %macro VP9_IDCT8_2x2_1D 0
> @@ -250,9 +270,12 @@ cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob
>  
>      mova               m12, [pw_11585x2]    ; often used
>  
> -    cmp eobd, 3 ; top left corner or less
> +    cmp eobd, 12 ; top left half or less
>      jg .idctfull
>  
> +    cmp eobd, 3  ; top left corner or less
> +    jg .idcthalf
> +
>      cmp eobd, 1 ; faster path for when only DC is set
>      jne .idcttopleftcorner
>  
> @@ -289,6 +312,22 @@ cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob
>      VP9_IDCT8_WRITEOUT
>      RET
>  
> +.idcthalf:
> +    movh                m0, [blockq + 0]
> +    movh                m1, [blockq +16]
> +    movh                m2, [blockq +32]
> +    movh                m3, [blockq +48]
> +    VP9_IDCT8_1D_4
> +    TRANSPOSE8x8W  0, 1, 2, 3, 8, 9, 10, 11, 4
> +    VP9_IDCT8_1D_4

> +    pxor                m4, m4
> +    movq       [blockq+ 0], m4
> +    movq       [blockq+16], m4
> +    movq       [blockq+32], m4
> +    movq       [blockq+48], m4

Why movq rather than movh (as used for the loads above)? Is movh less
efficient here?

> +    VP9_IDCT8_WRITEOUT
> +    RET
> +
>  .idctfull: ; generic full 8x8 idct/idct
>      mova                m0, [blockq+  0]    ; IN(0)
>      mova                m1, [blockq+ 16]    ; IN(1)

-- 
Clément B.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 490 bytes
Desc: not available
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20131202/f41acd05/attachment.asc>