[FFmpeg-devel] [PATCH] avfilter/vf_stereo3d: add x86 SIMD for anaglyph outputs

James Almer jamrial at gmail.com
Mon Oct 5 18:21:13 CEST 2015


On 10/5/2015 6:49 AM, Paul B Mahol wrote:
> diff --git a/libavfilter/x86/vf_stereo3d.asm b/libavfilter/x86/vf_stereo3d.asm
> new file mode 100644
> index 0000000..269004b
> --- /dev/null
> +++ b/libavfilter/x86/vf_stereo3d.asm
> @@ -0,0 +1,184 @@
> +;*****************************************************************************
> +;* x86-optimized functions for stereo3d filter
> +;*
> +;* Copyright (C) 2015 Paul B Mahol
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;*****************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +%if ARCH_X86_64
> +
> +SECTION_RODATA
> +
> +; rgbrgbrgbrgb
> +; rrrrggggbbbb
> +
> +shuf: db 0, 4, 8, 1,5, 9, 2, 6,10,3, 7,11,-1,-1,-1,-1
> +ex_r: db 0,-1,-1,-1,3,-1,-1,-1,6,-1,-1,-1, 9,-1,-1,-1
> +ex_g: db 1,-1,-1,-1,4,-1,-1,-1,7,-1,-1,-1,10,-1,-1,-1
> +ex_b: db 2,-1,-1,-1,5,-1,-1,-1,8,-1,-1,-1,11,-1,-1,-1
> +
> +SECTION .text
> +
> +INIT_XMM sse4
> +cglobal anaglyph, 11, 13, 16, 3*6*mmsize, dst, lsrc, rsrc, dst_linesize, l_linesize, r_linesize, width, height, ana_matrix_r, ana_matrix_g, ana_matrix_b
> +    movd                 m10, [ana_matrix_rq+ 0]
> +    movd                 m11, [ana_matrix_rq+ 4]
> +    movd                 m12, [ana_matrix_rq+ 8]
> +    movd                 m13, [ana_matrix_rq+12]
> +    movd                 m14, [ana_matrix_rq+16]
> +    movd                 m15, [ana_matrix_rq+20]
> +    pshufd               m10, m10, q0000
> +    pshufd               m11, m11, q0000
> +    pshufd               m12, m12, q0000
> +    pshufd               m13, m13, q0000
> +    pshufd               m14, m14, q0000
> +    pshufd               m15, m15, q0000

mova                 m13, [ana_matrix_rq +  0]
movq                 m15, [ana_matrix_rq + 16]
pshufd               m10, m13, q0000
pshufd               m11, m13, q1111
pshufd               m12, m13, q2222
pshufd               m13, m13, q3333
pshufd               m14, m15, q0000
pshufd               m15, m15, q1111

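This will probably be faster, since it fetches all six coefficients with
two loads instead of six and broadcasts each one with pshufd. (Note that
mova requires ana_matrix_r to be 16-byte aligned; if that is not
guaranteed, use movu instead.)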

Also, you're not using m7 anywhere, and m13, m14 and m15 remain
unused after the init code. You could keep four of the coeffs in
those registers instead of using the stack.



More information about the ffmpeg-devel mailing list