[FFmpeg-devel] [PATCH] avcodec/takdec: add x86 SIMD for rest of decorrelation modes

James Almer jamrial at gmail.com
Tue Oct 6 00:00:08 CEST 2015


On 10/5/2015 6:34 PM, Paul B Mahol wrote:
> diff --git a/libavcodec/x86/takdsp.asm b/libavcodec/x86/takdsp.asm
> new file mode 100644
> index 0000000..0158d4d
> --- /dev/null
> +++ b/libavcodec/x86/takdsp.asm
> @@ -0,0 +1,94 @@
> +;******************************************************************************
> +;* TAK DSP SIMD optimizations
> +;*
> +;* Copyright (C) 2015 Paul B Mahol
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
> +
> +pd_128: dd 128
> +
> +SECTION .text
> +
> +INIT_XMM sse2
> +cglobal tak_decorrelate_ls, 3, 3, 2, p1, p2, length
> +    .loop:
> +    mova                 m0, [p1q]
> +    mova                 m1, [p2q]
> +    paddd                m0, m1

paddd m0, [p2q]

> +    mova              [p2q], m0
> +    add                 p1q, mmsize
> +    add                 p2q, mmsize
> +    sub             lengthd, mmsize/4

Do the neg trick Hendrik told you about for the maskedmerge filter. That
way you will only need to do an add on the length register per loop.
Also, if the buffer is properly padded you could do 32 bytes at a time
instead of 16.

Same applies to the other functions.

> +    jg .loop
> +    REP_RET
> +
> +cglobal tak_decorrelate_sr, 3, 3, 2, p1, p2, length
> +    .loop:
> +    mova                 m0, [p1q]
> +    mova                 m1, [p2q]
> +    psubd                m1, m0
> +    mova              [p1q], m0
> +    add                 p1q, mmsize
> +    add                 p2q, mmsize
> +    sub             lengthd, mmsize/4
> +    jg .loop
> +    REP_RET
> +
> +cglobal tak_decorrelate_sm, 3, 3, 3, p1, p2, length
> +    .loop:
> +    mova                 m0, [p1q]
> +    mova                 m1, [p2q]
> +    mova                 m2, m1
> +    psrld                m2, 1
> +    psubd                m0, m2
> +    paddd                m1, m0
> +    mova              [p1q], m0
> +    mova              [p2q], m1
> +    add                 p1q, mmsize
> +    add                 p2q, mmsize
> +    sub             lengthd, mmsize/4
> +    jg .loop
> +    REP_RET
> +
> +INIT_XMM sse4
> +cglobal tak_decorrelate_sf, 5, 5, 5, p1, p2, length, dshift, dfactor
> +    movd                 m2, dshiftm
> +    movd                 m3, dfactorm

Change the cglobal line to 3, 3, 5. On x86_32 it will prevent the
unnecessary load of the last two arguments on gprs.

> +    pshufd               m3, m3, 0
> +    movd                 m4, [pd_128]

Change the pd_128 constant in Rodata to "times 4 dd 128" then just
do a mova m4, [pd_128]. It will save you the pshufd below.

> +    pshufd               m4, m4, 0
> +
> +    .loop:
> +    mova                 m0, [p1q]
> +    mova                 m1, [p2q]
> +    psrld                m1, m2
> +    pmulld               m1, m3
> +    paddd                m1, m4
> +    psrld                m1, 8
> +    pslld                m1, m2
> +    psubd                m1, m0
> +    mova              [p1q], m1
> +    add                 p1q, mmsize
> +    add                 p2q, mmsize
> +    sub             lengthd, mmsize/4
> +    jg .loop
> +    REP_RET



More information about the ffmpeg-devel mailing list