[FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

Martin Vignali martin.vignali at gmail.com
Sun Dec 3 21:31:41 EET 2017


>
> In any case, if clang or gcc can generate better code, then the hand
> written version needs to be optimized to be as fast or faster.
>
>
>
Quick test: this passes checkasm (but probably only because the tested width is 256)
hflip_byte_c: 26.4
hflip_byte_ssse3: 20.4


;------------------------------------------------------------------------------
; hflip_byte(src, dst, w)          (NASM syntax, FFmpeg x86inc macros, SSSE3)
;
; Copies w bytes so that dst[i] = src[-i] for i in [0, w): src is read
; backwards starting at the address passed in, dst is written forwards.
;
; In:    srcq = address of the byte that ends up in dst[0]
;        dstq = first output byte
;        w    = number of bytes (assumed non-negative; TODO confirm with caller)
; Regs:  vq   = scratch byte for the scalar tail (kept in r3 so that its byte
;               sub-register exists on x86-32 under x86inc's register mapping)
;        xq   = number of bytes already written
;        m0   = pshufb control mask loaded from pb_flip_byte (defined outside
;               this snippet; presumably a 16-byte byte-reversal pattern)
;        m1/2 = data
;
; Structure: 32 bytes per iteration while possible, then at most one 16-byte
; step, then a byte-at-a-time tail, so every width is fully handled.
;------------------------------------------------------------------------------
INIT_XMM ssse3
cglobal hflip_byte, 3, 5, 3, src, dst, w, v, x
    mova    m0, [pb_flip_byte]
    xor     xd, xd                      ; x = 0
    mov     wd, dword wm                ; 32-bit write: clears the upper half of wq on x86-64
    sub     wq, mmsize * 2              ; wq = w - 2 * mmsize
    jl .tail_xmm                        ; fewer than 2 * mmsize bytes in total

    .loop0:                             ; invariant: x + 2 * mmsize <= w
        neg     xq
        movu    m1, [srcq + xq - mmsize + 1]        ; src[-x-mmsize+1 .. -x]
        movu    m2, [srcq + xq - mmsize * 2 + 1]    ; the mmsize bytes before those
        pshufb  m1, m0
        pshufb  m2, m0
        neg     xq
        movu    [dstq + xq], m1
        movu    [dstq + xq + mmsize], m2
        add     xq, mmsize * 2
        cmp     xq, wq
        jle .loop0                      ; another full 2 * mmsize chunk still fits

    .tail_xmm:                          ; here 0 <= w - x < 2 * mmsize
        add     wq, mmsize              ; wq = w - mmsize
        cmp     xq, wq
        jg .tail_bytes                  ; fewer than mmsize bytes remain
        neg     xq
        movu    m1, [srcq + xq - mmsize + 1]
        pshufb  m1, m0
        neg     xq
        movu    [dstq + xq], m1
        add     xq, mmsize

    .tail_bytes:                        ; here 0 <= w - x < mmsize
        add     wq, mmsize              ; wq = w
        cmp     xq, wq
        jge .end                        ; nothing left (also covers w == 0)

    .loop1:                             ; scalar copy of the last few bytes
        neg     xq
        mov     vb, [srcq + xq]
        neg     xq
        mov     [dstq + xq], vb
        add     xq, 1
        cmp     xq, wq
        jl .loop1

    .end:
        RET


Martin


More information about the ffmpeg-devel mailing list