[FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
Martin Vignali
martin.vignali at gmail.com
Sun Dec 3 21:31:41 EET 2017
>
> In any case, if clang or gcc can generate better code, then the hand
> written version needs to be optimized to be as fast or faster.
>
>
>
Quick test: passes checkasm (but probably only because width = 256)
hflip_byte_c: 26.4
hflip_byte_ssse3: 20.4
;-----------------------------------------------------------------------------
; hflip_byte (SSSE3): horizontally mirror a run of bytes.
;
; Effect:  dst[i] = src[-i]  for i in [0, w)
;          i.e. src is read backwards starting at the address passed in, and
;          dst is written forwards.
; Args:    src (r0), dst (r1), w (r2, 32-bit byte count)
; Temps:   v  (r3) - scalar byte temporary for the final byte loop
;          x  (r4) - number of bytes already written to dst
; Vector:  m0 = shuffle mask loaded from pb_flip_byte (defined outside this
;               snippet; presumably the 15..0 byte-reversal pattern - confirm)
;          m1, m2 = data
;
; Processing is done in three stages so that every width is handled:
;   1. blocks of 2 * mmsize bytes,
;   2. at most one block of mmsize bytes,
;   3. the remaining (< mmsize) bytes one at a time.
;
; NOTE(review): v is deliberately placed in r3 rather than r4; with x86inc's
; usual x86-32 register mapping r4 has no encodable 8-bit form - confirm
; against the x86inc.asm in the tree.
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal hflip_byte, 3, 5, 3, src, dst, w, v, x
    mova            m0, [pb_flip_byte]
    xor             xd, xd                      ; x = 0 bytes done
    mov             wd, dword wm                ; reload width as a clean 32-bit value
    sub             wq, mmsize * 2              ; wq = w - 2 * mmsize
    jl .tail_xmm                                ; w < 2 * mmsize: no double-xmm block

.loop0:                                         ; two xmm per iteration
    neg             xq                          ; xq = -x, index into src
    movu            m1, [srcq + xq - mmsize + 1]     ; src[-x-15 .. -x]
    movu            m2, [srcq + xq - mmsize * 2 + 1] ; src[-x-31 .. -x-16]
    pshufb          m1, m0
    pshufb          m2, m0
    neg             xq                          ; xq = +x, index into dst
    movu            [dstq + xq], m1
    movu            [dstq + xq + mmsize], m2
    add             xq, mmsize * 2
    cmp             xq, wq
    jle .loop0                                  ; continue while x <= w - 2 * mmsize

.tail_xmm:                                      ; here: 0 <= w - x < 2 * mmsize
    add             wq, mmsize                  ; wq = w - mmsize
    cmp             xq, wq
    jg .tail_bytes                              ; fewer than mmsize bytes left
    neg             xq
    movu            m1, [srcq + xq - mmsize + 1]
    pshufb          m1, m0
    neg             xq
    movu            [dstq + xq], m1
    add             xq, mmsize

.tail_bytes:                                    ; here: 0 <= w - x < mmsize
    add             wq, mmsize                  ; wq = w
    cmp             xq, wq
    jge .end                                    ; nothing left (also covers w == 0)

.loop1:                                         ; one byte per iteration
    neg             xq
    mov             vb, [srcq + xq]             ; v = src[-x]
    neg             xq
    mov             [dstq + xq], vb             ; dst[x] = v
    add             xq, 1
    cmp             xq, wq
    jl .loop1

.end:
    RET
Martin
More information about the ffmpeg-devel
mailing list