[FFmpeg-devel] [PATCH 2/2] vp9: sse2/ssse3/avx 16bpp loopfilter x86 simd.

Henrik Gramner henrik at gramner.com
Wed Sep 30 17:54:43 CEST 2015


On Fri, Sep 25, 2015 at 5:25 PM, Ronald S. Bultje <rsbultje at gmail.com> wrote:
> diff --git a/libavcodec/x86/vp9dsp_init_16bpp_template.c b/libavcodec/x86/vp9dsp_init_16bpp_template.c

> +#define init_lpf_8_func(idx1, idx2, dir, wd, bpp, opt) \
> +    dsp->loop_filter_8[idx1][idx2] = ff_loop_filter_##dir##_##wd##_##bpp##_##opt
> +#define init_lpf_16_func(idx, dir, bpp, opt) \
> +    dsp->loop_filter_16[idx] = loop_filter_##dir##_16_##bpp##_##opt
> +#define init_lpf_mix2_func(idx1, idx2, idx3, dir, wd1, wd2, bpp, opt) \
> +    dsp->loop_filter_mix2[idx1][idx2][idx3] = loop_filter_##dir##_##wd1##wd2##_##bpp##_##opt

Are those defines required for macro expansion? If so, ok, otherwise
I'd drop them since they don't really simplify anything.

> diff --git a/libavcodec/x86/vp9lpf_16bpp.asm b/libavcodec/x86/vp9lpf_16bpp.asm

> +pw_4095: times 16 dw 4095

This one already exists in vp9mc_16bpp.asm, merge them.

> +%macro FILTER_STEP 6-10 "", "", "", 0 ; tmp, reg, mask, shift, dst, \
> +                                      ; src/sub1, sub2, add1, add2, dont_store
> +    psrlw               %1, %2, %4
> +%ifnidn %7, ""
> +    psubw               %2, %6
> +%endif
> +    psubw               %1, %6                      ; abs->delta
> +%ifnidn %7, ""
> +    psubw               %2, %7
> +%endif
> +    pand                %1, reg_%3                  ; apply mask
> +%ifnidn %7, ""
> +    paddw               %2, %8
> +%endif
> +%if %10 == 1
> +    paddw               %6, %1                      ; delta->abs
> +%else
> +    paddw               %1, %6                      ; delta->abs
> +%endif
> +%ifnidn %7, ""
> +    paddw               %2, %9
> +%endif
> +%if %10 != 1
> +    mova              [%5], %1
> +%endif
> +%endmacro

Is there a reason for not merging most of those %ifs to make it more readable?

> +    psllw               m5, 1
[...]
> +    psllw               m5, m3, 1
[...]
> +    psllw               m5, m1, 1

Use paddw instead of left-shifting by 1.

Otherwise LGTM. Also seems like AVX2 would indeed be quite useful here
if you feel like doing that.


More information about the ffmpeg-devel mailing list