[FFmpeg-devel] [PATCHv3] lavu/x86/lls: add fma3 optimizations for update_lls
Ganesh Ajjanagadde
gajjanagadde at gmail.com
Fri Jan 15 22:48:21 CET 2016
On Thu, Jan 14, 2016 at 7:39 PM, Ganesh Ajjanagadde
<gajjanagadde at gmail.com> wrote:
> This improves accuracy (very slightly) and speed for processors having
> fma3.
>
> Sample benchmark (fate flac-16-lpc-cholesky, Haswell):
> old:
> 5993610 decicycles in ff_lpc_calc_coefs, 64 runs, 0 skips
> 5951528 decicycles in ff_lpc_calc_coefs, 128 runs, 0 skips
>
> new:
> 5252410 decicycles in ff_lpc_calc_coefs, 64 runs, 0 skips
> 5232869 decicycles in ff_lpc_calc_coefs, 128 runs, 0 skips
>
> Tested with FATE and with --disable-fma3; also examined the contents of
> lavu/lls-test.
>
> Reviewed-by: James Almer <jamrial at gmail.com>
> Reviewed-by: Henrik Gramner <henrik at gramner.com>
> Signed-off-by: Ganesh Ajjanagadde <gajjanagadde at gmail.com>
> ---
> libavutil/x86/lls.asm | 59 ++++++++++++++++++++++++++++++++++++++++++++++--
> libavutil/x86/lls_init.c | 4 ++++
> 2 files changed, 61 insertions(+), 2 deletions(-)
>
> diff --git a/libavutil/x86/lls.asm b/libavutil/x86/lls.asm
> index 769befb..317fba6 100644
> --- a/libavutil/x86/lls.asm
> +++ b/libavutil/x86/lls.asm
> @@ -125,8 +125,7 @@ cglobal update_lls, 2,5,8, ctx, var, i, j, covar2
> .ret:
> REP_RET
>
> -%if HAVE_AVX_EXTERNAL
> -INIT_YMM avx
> +%macro UPDATE_LLS 0
> cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
> %define covarq ctxq
> mov countd, [ctxq + LLSModel.indep_count]
> @@ -140,6 +139,18 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
> vbroadcastsd ymm6, [varq + iq*8 + 16]
> vbroadcastsd ymm7, [varq + iq*8 + 24]
> vextractf128 xmm3, ymm1, 1
> +%if cpuflag(fma3)
> + mova ymm0, COVAR(iq ,0)
> + mova xmm2, COVAR(iq+2,2)
> + fmaddpd ymm0, ymm1, ymm4, ymm0
> + fmaddpd xmm2, xmm3, xmm6, xmm2
> + fmaddpd ymm1, ymm5, ymm1, COVAR(iq ,1)
> + fmaddpd xmm3, xmm7, xmm3, COVAR(iq+2,3)
> + mova COVAR(iq ,0), ymm0
> + mova COVAR(iq ,1), ymm1
> + mova COVAR(iq+2,2), xmm2
> + mova COVAR(iq+2,3), xmm3
> +%else
> vmulpd ymm0, ymm1, ymm4
> vmulpd ymm1, ymm1, ymm5
> vmulpd xmm2, xmm3, xmm6
> @@ -148,12 +159,26 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
> ADDPD_MEM COVAR(iq ,1), ymm1
> ADDPD_MEM COVAR(iq+2,2), xmm2
> ADDPD_MEM COVAR(iq+2,3), xmm3
> +%endif ; cpuflag(fma3)
> lea jd, [iq + 4]
> cmp jd, count2d
> jg .skip4x4
> .loop4x4:
> ; Compute all 16 pairwise products of a 4x4 block
> mova ymm3, [varq + jq*8]
> +%if cpuflag(fma3)
> + mova ymm0, COVAR(jq, 0)
> + mova ymm1, COVAR(jq, 1)
> + mova ymm2, COVAR(jq, 2)
> + fmaddpd ymm0, ymm3, ymm4, ymm0
> + fmaddpd ymm1, ymm3, ymm5, ymm1
> + fmaddpd ymm2, ymm3, ymm6, ymm2
> + fmaddpd ymm3, ymm7, ymm3, COVAR(jq,3)
> + mova COVAR(jq, 0), ymm0
> + mova COVAR(jq, 1), ymm1
> + mova COVAR(jq, 2), ymm2
> + mova COVAR(jq, 3), ymm3
> +%else
> vmulpd ymm0, ymm3, ymm4
> vmulpd ymm1, ymm3, ymm5
> vmulpd ymm2, ymm3, ymm6
> @@ -162,6 +187,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
> ADDPD_MEM COVAR(jq,1), ymm1
> ADDPD_MEM COVAR(jq,2), ymm2
> ADDPD_MEM COVAR(jq,3), ymm3
> +%endif ; cpuflag(fma3)
> add jd, 4
> cmp jd, count2d
> jle .loop4x4
> @@ -169,6 +195,19 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
> cmp jd, countd
> jg .skip2x4
> mova xmm3, [varq + jq*8]
> +%if cpuflag(fma3)
> + mova xmm0, COVAR(jq, 0)
> + mova xmm1, COVAR(jq, 1)
> + mova xmm2, COVAR(jq, 2)
> + fmaddpd xmm0, xmm3, xmm4, xmm0
> + fmaddpd xmm1, xmm3, xmm5, xmm1
> + fmaddpd xmm2, xmm3, xmm6, xmm2
> + fmaddpd xmm3, xmm7, xmm3, COVAR(jq,3)
> + mova COVAR(jq, 0), xmm0
> + mova COVAR(jq, 1), xmm1
> + mova COVAR(jq, 2), xmm2
> + mova COVAR(jq, 3), xmm3
> +%else
> vmulpd xmm0, xmm3, xmm4
> vmulpd xmm1, xmm3, xmm5
> vmulpd xmm2, xmm3, xmm6
> @@ -177,6 +216,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
> ADDPD_MEM COVAR(jq,1), xmm1
> ADDPD_MEM COVAR(jq,2), xmm2
> ADDPD_MEM COVAR(jq,3), xmm3
> +%endif ; cpuflag(fma3)
> .skip2x4:
> add id, 4
> add covarq, 4*COVAR_STRIDE
> @@ -187,14 +227,29 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
> mov jd, id
> .loop2x1:
> vmovddup xmm0, [varq + iq*8]
> +%if cpuflag(fma3)
> + mova xmm1, [varq + jq*8]
> + fmaddpd xmm0, xmm1, xmm0, COVAR(jq,0)
> + mova COVAR(jq,0), xmm0
> +%else
> vmulpd xmm0, [varq + jq*8]
> ADDPD_MEM COVAR(jq,0), xmm0
> +%endif ; cpuflag(fma3)
> inc id
> add covarq, COVAR_STRIDE
> cmp id, countd
> jle .loop2x1
> .ret:
> REP_RET
> +%endmacro ; UPDATE_LLS
> +
> +%if HAVE_AVX_EXTERNAL
> +INIT_YMM avx
> +UPDATE_LLS
> +%endif
> +%if HAVE_FMA3_EXTERNAL
> +INIT_YMM fma3
> +UPDATE_LLS
> %endif
>
> INIT_XMM sse2
> diff --git a/libavutil/x86/lls_init.c b/libavutil/x86/lls_init.c
> index 81f141c..9f0d862 100644
> --- a/libavutil/x86/lls_init.c
> +++ b/libavutil/x86/lls_init.c
> @@ -25,6 +25,7 @@
>
> void ff_update_lls_sse2(LLSModel *m, const double *var);
> void ff_update_lls_avx(LLSModel *m, const double *var);
> +void ff_update_lls_fma3(LLSModel *m, const double *var);
> double ff_evaluate_lls_sse2(LLSModel *m, const double *var, int order);
>
> av_cold void ff_init_lls_x86(LLSModel *m)
> @@ -38,4 +39,7 @@ av_cold void ff_init_lls_x86(LLSModel *m)
> if (EXTERNAL_AVX_FAST(cpu_flags)) {
> m->update_lls = ff_update_lls_avx;
> }
> + if (EXTERNAL_FMA3(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
> + m->update_lls = ff_update_lls_fma3;
> + }
> }
> --
> 2.7.0
>
Pushed; the patch was reviewed by James Almer on IRC. Thanks.
More information about the ffmpeg-devel mailing list