[FFmpeg-devel] [PATCH 1/7] lavc/flacenc: add sse4 version of the lpc encoder

Michael Niedermayer michaelni at gmx.at
Sat Feb 15 20:44:39 CET 2014


On Sat, Feb 15, 2014 at 02:54:57AM +0100, James Darnley wrote:
> From 1.8 to 2.4 times faster.  Runtime is reduced by 2 to 39%.  The
> speed-up generally increases with compression_level.
> 
> This lpc encoder is not used with levels < 3 so it provides no speed-up
> in these cases.
> ---
>  LICENSE                         |    1 +
>  libavcodec/x86/Makefile         |    3 +
>  libavcodec/x86/flac_dsp_gpl.asm |   78 +++++++++++++++++++++++++++++++++++++++
>  libavcodec/x86/flacdsp_init.c   |    4 ++
>  4 files changed, 86 insertions(+), 0 deletions(-)
>  create mode 100644 libavcodec/x86/flac_dsp_gpl.asm
> 
> diff --git a/LICENSE b/LICENSE
> index 1f757aa..490adff 100644
> --- a/LICENSE
> +++ b/LICENSE
> @@ -16,6 +16,7 @@ Specifically, the GPL parts of FFmpeg are
>  - libmpcodecs
>  - optional x86 optimizations in the files
>    libavcodec/x86/idct_mmx.c
> +  libavcodec/x86/flac_dsp_gpl.asm
>  - libutvideo encoding/decoding wrappers in
>    libavcodec/libutvideo*.cpp
>  - the X11 grabber in libavdevice/x11grab.c
> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
> index 7b56178..2cf9d2c 100644
> --- a/libavcodec/x86/Makefile
> +++ b/libavcodec/x86/Makefile
> @@ -75,6 +75,9 @@ YASM-OBJS-$(CONFIG_DSPUTIL)            += x86/dsputil.o                 \
>  YASM-OBJS-$(CONFIG_ENCODERS)           += x86/dsputilenc.o
>  YASM-OBJS-$(CONFIG_FFT)                += x86/fft.o
>  YASM-OBJS-$(CONFIG_FLAC_DECODER)       += x86/flacdsp.o
> +ifdef CONFIG_GPL
> +YASM-OBJS-$(CONFIG_FLAC_ENCODER)       += x86/flac_dsp_gpl.o
> +endif
>  YASM-OBJS-$(CONFIG_H263DSP)            += x86/h263_loopfilter.o
>  YASM-OBJS-$(CONFIG_H264CHROMA)         += x86/h264_chromamc.o           \
>                                            x86/h264_chromamc_10bit.o
> diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
> new file mode 100644
> index 0000000..3ce5fdf
> --- /dev/null
> +++ b/libavcodec/x86/flac_dsp_gpl.asm
> @@ -0,0 +1,78 @@
> +;*****************************************************************************
> +;* FLAC DSP functions
> +;*
> +;* Copyright (c) 2014 James Darnley <james.darnley at gmail.com>
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or modify
> +;* it under the terms of the GNU General Public License as published by
> +;* the Free Software Foundation; either version 2 of the License, or
> +;* (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +;* GNU General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU General Public License along
> +;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
> +;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_TEXT
> +
> +INIT_XMM sse4
> +%if ARCH_X86_64
> +    cglobal flac_enc_lpc_16, 5, 7, 4, 0, res, smp, len, order, coefs, shift
> +    %define posj r5
> +    %define negj r6
> +%else
> +    cglobal flac_enc_lpc_16, 5, 6, 4, 0, res, smp, len, order, coefs, shift
> +    %define posj r2
> +    %define negj r5
> +%endif
> +
> +; Is it worth looping correctly over the first samples?  The most that ever need
> +; to be copied is 32 so we might as well just unroll the loop and do all 32.
> +%assign iter 0
> +%rep 32/(mmsize/4)
> +    movu m0, [smpq+iter]
> +    movu [resq+iter], m0
> +    %assign iter iter+mmsize
> +%endrep
> +
> +lea resq, [resq+orderq*4]
> +lea smpq, [smpq+orderq*4]
> +lea coefsq, [coefsq+orderq*4]
> +sub lenmp, orderq
> +movd m3, shiftmp
> +neg orderq


[...]
> +void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const int32_t *,int);


You are mixing 32-bit function arguments (int) with 64-bit reads of
them in the asm (e.g. orderq is used directly in the lea address
computations above).
The high 32 bits of those registers could be non-zero and cause a crash.

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Good people do not need laws to tell them to act responsibly, while bad
people will find a way around the laws. -- Plato
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20140215/3343a111/attachment.asc>


More information about the ffmpeg-devel mailing list