[FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

Martin Vignali martin.vignali at gmail.com
Sun Dec 3 23:16:24 EET 2017


Checkasm results (macOS) for your last patch:
hflip_byte_c: 28.5
hflip_byte_ssse3: 29.0
hflip_short_c: 277.7
hflip_short_ssse3: 65.0

If you add a "cmp xq, wq" (followed by "je .end") after the SIMD loop,
it can be faster than the C version (clang) when the width is a multiple of mmsize*2:

hflip_byte_c: 28.5
hflip_byte_ssse3: 27.5

see below


Otherwise it looks OK (I will later send a much cleaner patch for the checkasm test,
and a patch to use one macro for both functions).

+
> +pb_flip_byte:  db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
> +pb_flip_short: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1
> +
> +SECTION .text
> +
> +INIT_XMM ssse3
> +cglobal hflip_byte, 3, 6, 3, src, dst, w, x, v, r
>
+    mova    m0, [pb_flip_byte]
> +    mov     xq, 0
> +    mov     wd, dword wm
> +    mov     rq, wq
> +    and     rq, 2 * mmsize - 1
> +    cmp     wq, 2 * mmsize
> +    jl .loop1
> +    sub     wq, rq
> +
> +    .loop0:
> +        neg     xq
> +        movu    m1, [srcq + xq -     mmsize + 1]
> +        movu    m2, [srcq + xq - 2 * mmsize + 1]
> +        pshufb  m1, m0
> +        pshufb  m2, m0
> +        neg     xq
> +        movu    [dstq + xq         ], m1
> +        movu    [dstq + xq + mmsize], m2
> +        add     xq, mmsize * 2
> +        cmp     xq, wq
> +        jl .loop0
>

cmp xq, wq
je .end


> +
> +        add    wq, rq
> +
> +    .loop1:
> +        neg    xq
> +        mov    vb, [srcq + xq]
> +        neg    xq
> +        mov    [dstq + xq], vb
> +        add    xq, 1
> +        cmp    xq, wq
> +        jl .loop1
>

.end:


> +RET
>


More information about the ffmpeg-devel mailing list