[FFmpeg-devel] [PATCH] lavu/x86/lls: add fma3 optimizations for update_lls
Ganesh Ajjanagadde
gajjanagadde at gmail.com
Thu Jan 14 01:03:53 CET 2016
On Wed, Jan 13, 2016 at 6:59 PM, Ganesh Ajjanagadde
<gajjanagadde at gmail.com> wrote:
> This improves accuracy (very slightly) and speed for processors having
> fma3.
>
> Sample benchmark (fate flac-16-lpc-cholesky, Haswell):
> old:
> 5993610 decicycles in ff_lpc_calc_coefs, 64 runs, 0 skips
> 5951528 decicycles in ff_lpc_calc_coefs, 128 runs, 0 skips
>
> new:
> 5252410 decicycles in ff_lpc_calc_coefs, 64 runs, 0 skips
> 5232869 decicycles in ff_lpc_calc_coefs, 128 runs, 0 skips
>
> Tested with FATE and --disable-fma3, also examined contents of
> lavu/lls-test.
>
> Signed-off-by: Ganesh Ajjanagadde <gajjanagadde at gmail.com>
> ---
> libavutil/x86/lls.asm | 61 ++++++++++++++++++++++++++++++++++++++++++++++--
> libavutil/x86/lls_init.c | 4 ++++
> 2 files changed, 63 insertions(+), 2 deletions(-)
>
> diff --git a/libavutil/x86/lls.asm b/libavutil/x86/lls.asm
> index 769befb..358603a 100644
> --- a/libavutil/x86/lls.asm
> +++ b/libavutil/x86/lls.asm
> @@ -125,8 +125,7 @@ cglobal update_lls, 2,5,8, ctx, var, i, j, covar2
> .ret:
> REP_RET
>
> -%if HAVE_AVX_EXTERNAL
> -INIT_YMM avx
> +%macro UPDATE_LLS 0
> cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
> %define covarq ctxq
> mov countd, [ctxq + LLSModel.indep_count]
> @@ -140,6 +139,18 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
> vbroadcastsd ymm6, [varq + iq*8 + 16]
> vbroadcastsd ymm7, [varq + iq*8 + 24]
> vextractf128 xmm3, ymm1, 1
> +%if cpuflag(fma3)
> + mova ymm0, COVAR(iq ,0)
> + mova xmm2, COVAR(iq+2,2)
> + vfmadd231pd ymm0, ymm1, ymm4
> + vfmadd231pd xmm2, xmm3, xmm6
> + vfmadd213pd ymm1, ymm5, COVAR(iq ,1)
> + vfmadd213pd xmm3, xmm7, COVAR(iq+2,3)
> + mova COVAR(iq ,0), ymm0
> + mova COVAR(iq ,1), ymm1
> + mova COVAR(iq+2,2), xmm2
> + mova COVAR(iq+2,3), xmm3
> +%else
> vmulpd ymm0, ymm1, ymm4
> vmulpd ymm1, ymm1, ymm5
> vmulpd xmm2, xmm3, xmm6
> @@ -148,12 +159,27 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
> ADDPD_MEM COVAR(iq ,1), ymm1
> ADDPD_MEM COVAR(iq+2,2), xmm2
> ADDPD_MEM COVAR(iq+2,3), xmm3
> +%endif ; cpuflag(fma3)
> lea jd, [iq + 4]
> cmp jd, count2d
> jg .skip4x4
> .loop4x4:
> ; Compute all 16 pairwise products of a 4x4 block
> mova ymm3, [varq + jq*8]
> +%if cpuflag(fma3)
> + mova ymm0, COVAR(jq, 0)
> + mova ymm1, COVAR(jq, 1)
> + mova ymm2, COVAR(jq, 2)
> + mova ymm3, COVAR(jq, 3)
> + vfmadd231pd ymm0, ymm3, ymm4
> + vfmadd231pd ymm1, ymm3, ymm5
> + vfmadd231pd ymm2, ymm3, ymm6
> + vfmadd231pd ymm3, ymm3, ymm7
> + mova COVAR(jq, 0), ymm0
> + mova COVAR(jq, 1), ymm1
> + mova COVAR(jq, 2), ymm2
> + mova COVAR(jq, 3), ymm3
> +%else
> vmulpd ymm0, ymm3, ymm4
> vmulpd ymm1, ymm3, ymm5
> vmulpd ymm2, ymm3, ymm6
> @@ -162,6 +188,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
> ADDPD_MEM COVAR(jq,1), ymm1
> ADDPD_MEM COVAR(jq,2), ymm2
> ADDPD_MEM COVAR(jq,3), ymm3
> +%endif ; cpuflag(fma3)
> add jd, 4
> cmp jd, count2d
> jle .loop4x4
> @@ -169,6 +196,20 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
> cmp jd, countd
> jg .skip2x4
> mova xmm3, [varq + jq*8]
> +%if cpuflag(fma3)
> + mova xmm0, COVAR(jq, 0)
> + mova xmm1, COVAR(jq, 1)
> + mova xmm2, COVAR(jq, 2)
> + mova xmm3, COVAR(jq, 3)
> + vfmadd231pd xmm0, xmm3, xmm4
> + vfmadd231pd xmm1, xmm3, xmm5
> + vfmadd231pd xmm2, xmm3, xmm6
> + vfmadd231pd xmm3, xmm3, xmm7
> + mova COVAR(jq, 0), xmm0
> + mova COVAR(jq, 1), xmm1
> + mova COVAR(jq, 2), xmm2
> + mova COVAR(jq, 3), xmm3
> +%else
> vmulpd xmm0, xmm3, xmm4
> vmulpd xmm1, xmm3, xmm5
> vmulpd xmm2, xmm3, xmm6
> @@ -177,6 +218,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
> ADDPD_MEM COVAR(jq,1), xmm1
> ADDPD_MEM COVAR(jq,2), xmm2
> ADDPD_MEM COVAR(jq,3), xmm3
> +%endif ; cpuflag(fma3)
> .skip2x4:
> add id, 4
> add covarq, 4*COVAR_STRIDE
> @@ -187,14 +229,29 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
> mov jd, id
> .loop2x1:
> vmovddup xmm0, [varq + iq*8]
> +%if cpuflag(fma3)
> + mova xmm1, [varq + jq*8]
> + vfmadd213pd xmm0, xmm1, COVAR(jq,0)
> + mova COVAR(jq,0), xmm0
> +%else
> vmulpd xmm0, [varq + jq*8]
> ADDPD_MEM COVAR(jq,0), xmm0
> +%endif ; cpuflag(fma3)
> inc id
> add covarq, COVAR_STRIDE
> cmp id, countd
> jle .loop2x1
> .ret:
> REP_RET
> +%endmacro ; UPDATE_LLS
> +
> +%if HAVE_AVX_EXTERNAL
> +INIT_YMM avx
> +UPDATE_LLS
> +%endif
> +%if HAVE_FMA3_EXTERNAL
> +INIT_YMM fma3
> +UPDATE_LLS
> %endif
>
> INIT_XMM sse2
> diff --git a/libavutil/x86/lls_init.c b/libavutil/x86/lls_init.c
> index 81f141c..9f0d862 100644
> --- a/libavutil/x86/lls_init.c
> +++ b/libavutil/x86/lls_init.c
> @@ -25,6 +25,7 @@
>
> void ff_update_lls_sse2(LLSModel *m, const double *var);
> void ff_update_lls_avx(LLSModel *m, const double *var);
> +void ff_update_lls_fma3(LLSModel *m, const double *var);
> double ff_evaluate_lls_sse2(LLSModel *m, const double *var, int order);
>
> av_cold void ff_init_lls_x86(LLSModel *m)
> @@ -38,4 +39,7 @@ av_cold void ff_init_lls_x86(LLSModel *m)
> if (EXTERNAL_AVX_FAST(cpu_flags)) {
> m->update_lls = ff_update_lls_avx;
> }
> + if (EXTERNAL_FMA3(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
> + m->update_lls = ff_update_lls_fma3;
> + }
> }
> --
> 2.7.0
>
I should mention one thing: rank-one updates of the Cholesky factorization
are likely not that useful here, since I examined the fate-flac entry and
found ~4000 update calls for 1 solve call. I want to add this as a comment
to the update_lls function, so that in the future anyone (myself included)
thinking along those lines is aware that it is better not to do rank-one
updates of the Cholesky factorization. I can add it to this patch or to a
separate one, whichever people prefer.
More information about the ffmpeg-devel
mailing list