[FFmpeg-devel] [PATCH]v6 Opus Pyramid Vector Quantization Search in x86 SIMD asm
Henrik Gramner
henrik at gramner.com
Mon Jul 31 19:10:40 EEST 2017
On Wed, Jul 26, 2017 at 4:56 PM, Ivan Kalvachev <ikalvachev at gmail.com> wrote:
> +++ b/libavcodec/x86/opus_pvq_search.asm
Generic minor stuff:
- Use rN instead of rNq for numbered registers (the q suffix is used for
  named args only, due to preprocessor limitations).
- Use the same "standard" vertical alignment rules as most existing
  code, e.g. instructions indented by 4 spaces and operands aligned on
  the first comma.
- Use xmN instead of xmmN (it only really makes a difference when SWAPs
  are used, but might as well do it "correctly").
- Use 32-bit operands, e.g. rNd, when 64-bit math isn't required.
- Unless aligning every single loop entry helps a lot, I'd avoid it
  since it wastes a bit of icache.
- Explicitly using the carry flag as a branch condition is a bit weird;
  jg/jl/jge/jle tends to be clearer.
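To illustrate, a few lines from the patch rewritten that way (a purely
hypothetical loop tail, just showing the formatting):

    movups  m2, [inXq + r4]     ; rN instead of rNq
    andps   m2, m3
    movaps  [tmpX + r4], m2
    sub     r4d, mmsize         ; 32-bit operand, 64-bit math not needed here
    jge     %%loop              ; signed condition instead of the carry flag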
> +%ifdef __NASM_VER__
> +%use "smartalign"
> +ALIGNMODE p6
> +%endif
Assembler-specific ifdeffery should be avoided in individual files.
Something equivalent already exists in x86inc actually but I don't
remember if it was merged to FFmpeg from upstream (x264) yet.
> +const_int32_offsets:
> + %rep 8
> + dd $-const_int32_offsets
> + %endrep
Isn't this just an overly complicated way of expressing "dd 0*4, 1*4,
2*4, 3*4, 4*4, 5*4, 6*4, 7*4"?
> +SECTION .text
> +
> +
> +
> +
Reduce some of the excessive whitespace.
> +%macro HSUMPS 2 ; dst/src, tmp
> +%if cpuflag(avx)
> + %if sizeof%1>=32 ; avx
> + vperm2f128 %2, %1, %1, (0)*16+(1) ; %2=lo(%1)<<128+hi(%1)
> + vaddps %1, %2
> + %endif
> + vshufps %2, %1, %1, q1032
> + vaddps %1, %2
> + vshufps %2, %1, %1, q0321
> + vaddps %1, %2
> +
> +%else ; this form is a bit faster than the short avx-like emulation.
> + movaps %2, %1 ; [d, c, b, a ]
> + shufps %1, %1, q1032 ; %2=[b, a, d, c ]
> + addps %1, %2 ; %1=[b+d, a+c, d+b, c+a ]
> + movaps %2, %1
> + shufps %1, %1, q0321 ; %2=[c+a, b+d, a+c, d+b ]
> + addps %1, %2 ; %1=[c+a+b+d, b+d+a+c, a+c+d+b, d+b+c+a ]
> + ; all %1 members should be equal for as long as float a+b==b+a
> +%endif
> +%endmacro
Is reordering moves for the non-AVX path worth the additional
complexity? Microoptimizations that only affect legacy hardware are
IMO a bit questionable.
> +%macro EMU_pblendvb 3 ; dst/src_a, src_b, mask
> +%if cpuflag(avx)
> + %if cpuflag(avx) && notcpuflag(avx2) && sizeof%1 >= 32
> + %error AVX1 does not support integer 256 bit ymm operations
> + %endif
> +
> + vpblendvb %1, %1, %2, %3
> +;-------------------------
> +%elif cpuflag(sse4)
> + %ifnidn %3,xmm0
> + %error sse41 blendvps uses xmm0 as default 3 operand, you used %3
> + %endif
> + pblendvb %1, %2, %3
> +;-------------------------
> +%else
> + pxor %2, %1
> + pand %2, %3
> + pxor %1, %2
> +%endif
> +%endmacro
The AVX1 integer warning is kind of silly and doesn't really serve any purpose.
> +%macro EMU_broadcastss 2 ; dst, src
> +%if cpuflag(avx2)
> + vbroadcastss %1, %2 ; ymm, xmm
> +;-------------------------
> +%elif cpuflag(avx)
> + %ifnum sizeof%2 ; avx1 register
> + vpermilps xmm%1, xmm%2, q0000 ; xmm, xmm, imm || ymm, ymm, imm
> + %if sizeof%1 >= 32 ; mmsize>=32
> + vinsertf128 %1, %1,xmm%1, 1 ; ymm, ymm, xmm, im
> + %endif
> + %else ; avx1 memory
> + vbroadcastss %1, %2 ; ymm, mm32 || xmm, mm32
> + %endif
> +;-------------------------
> +%else
> + %ifnum sizeof%2 ; sse register
> + shufps %1, %2, %2, q0000
> + %else ; sse memory
> + movss %1, %2 ; this zeroes the other 3 elements
> + shufps %1, %1, 0
> + %endif
> +%endif
> +%endmacro
Use the existing x86util VBROADCASTSS macro. Modify it if it's lacking
something.
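I.e. with x86util.asm %include:d, usage should just be something like
the following (operands are placeholders; extend the macro if it
doesn't handle a register source the way your EMU_broadcastss does):

    VBROADCASTSS m4, xm0      ; splat from register
    VBROADCASTSS m3, [r5]     ; splat from memory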
> +%macro EMU_pbroadcastd 2 ; dst, src
> +%if cpuflag(avx2)
> + vpbroadcastd %1, %2
> +%elif cpuflag(avx) && mmsize >= 32
> + %error AVX1 does not support integer 256 bit ymm operations
> +%else
> + %ifnum sizeof%2 ; sse2 register
> + pshufd %1, %2, q0000
> + %else ; sse memory
> + movd %1, %2 ; movd zero extends
> + pshufd %1, %1, 0
> + %endif
> +%endif
> +%endmacro
Same as above, but using the SPLATD macro.
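I.e. roughly the following, if I remember the x86util SPLATD signature
correctly (single operand, in-place, so the load stays separate;
operands are placeholders):

    movd    m1, [r5]     ; movd zero-extends
    SPLATD  m1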
> +; Merge parallel maximums final round (1 vs 1)
> + movaps xmm0, xmm3 ; m0 = m3[0] = p[0]
> + shufps xmm3, xmm3, q1111 ; m3 = m3[1] = p[1]
> + cmpless xmm0, xmm3
Use 3-arg instead of reg-reg movaps.
Use the actual cmpss instruction instead of pseudo-instructions.
x86inc doesn't support automatic VEX handling for every possible
pseudo-instruction (mainly because there are so many of them), which
will result in AVX state transitions if INIT_YMM is used.
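Roughly (untested; LE is predicate 2, and note the mask ends up in xm3
here instead of xm0):

    shufps  xm0, xm3, xm3, q1111    ; xm0 = p[1] in all lanes
    cmpss   xm3, xm0, 2             ; xm3 = (p[0] <= p[1]) ? -1 : 0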
> +%macro SET_PIC_BASE 3; reg, const_label
> +%ifdef PIC
> + %{1} %2, [%3] ; lea r5q, [rip+const]
> + %define pic_base_%3 %2
> +%else
> + %define pic_base_%3 %3
> +%endif
> +%endmacro
[...]
> + SET_PIC_BASE lea, r5q, const_align_abs_edge ; rip+const
> + movups m3, [pic_base_const_align_abs_edge + r4q - mmsize]
This macro is only ever used once, remove and inline it to reduce complexity.
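I.e. just inline it as something along these lines:

    %ifdef PIC
        lea     r5, [const_align_abs_edge]    ; rip-relative with DEFAULT REL
        movups  m3, [r5 + r4 - mmsize]
    %else
        movups  m3, [const_align_abs_edge + r4 - mmsize]
    %endif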
> +%if num_mmregs > 8
> +%define num_hireg_used 3
> +%endif
IIRC the number of used mmregs is ignored in 32-bit mode so I believe
you can unconditionally specify 11.
> + lea r4q, [Nq - mmsize] ; Nq is rounded up (aligned up) to mmsize, so r4q can't become negative here, unless N=0.
> + movups m2, [inXq + r4q]
> + andps m2, m3
> + movaps [tmpX + r4q], m2
> + movaps m1, m2 ; Sx
Redundant reg-reg movaps, use m1 directly instead of using m2 at all.
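I.e.:

    movups    m1, [inXq + r4]
    andps     m1, m3
    movaps    [tmpX + r4], m1     ; m1 = Sx
    cvttps2dq m2, m1              ; (or cvtps2dq) convert straight from m1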
> + %if PRESEARCH_ROUNDING == 0
> + cvttps2dq m2, m2 ; yt = (int)truncf( res*X[i] )
> + %else
> + cvtps2dq m2, m2 ; yt = (int)lrintf( res*X[i] )
> + %endif
> + paddd m5, m2 ; Sy += yt
> + cvtdq2ps m2, m2 ; yt = (float)yt
Would using roundps instead of doing float-int-float conversion be beneficial?
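For reference, that would be something like this (SSE4.1+ only, and Sy
would then have to be accumulated as float or converted separately, so
it may not actually be a win):

    %if PRESEARCH_ROUNDING == 0
        roundps m2, m2, 3    ; truncate toward zero
    %else
        roundps m2, m2, 0    ; round to nearest (even)
    %endif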
> + mulps m1, m2 ; m1 = X[i]*yt
> + movaps [tmpY + r4q], m2 ; y[i] = m2
> + addps m7, m1 ; Sxy += m1;
> + mulps m2, m2 ; m2 = yt*yt
> + addps m6, m2 ; Syy += m2
Potential use case for 2x FMA instead of 2x mul+add.
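E.g. on an FMA3-gated code path, with m1 = X[i] and m2 = yt as above:

    vfmadd231ps m7, m1, m2    ; Sxy += X[i]*yt
    vfmadd231ps m6, m2, m2    ; Syy += yt*yt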
> +%%restore_sign_loop:
> + movaps m0, [tmpY + r4q] ; m0 = Y[i]
> + movups m1, [inXq + r4q] ; m1 = X[i]
> + andps m1, m2 ; m1 = sign(X[i])
> + orps m0, m1 ; m0 = Y[i]*sign
> + cvtps2dq m3, m0 ; m3 = (int)m0
> + movaps [outYq + r4q], m3
The orps instruction could be done using a memory operand instead of
movaps+orps.
AVX supports unaligned memory operands so the same could conditionally
be done for the andps as well but that's a bit of a nit.
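I.e. something like:

    movups   m1, [inXq + r4]     ; m1 = X[i]
    andps    m1, m2              ; m1 = sign(X[i])
    orps     m1, [tmpY + r4]     ; m1 = Y[i]*sign (tmpY is aligned)
    cvtps2dq m3, m1
    movaps   [outYq + r4], m3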
> +%if ARCH_X86_64 == 0 ;sbrdsp
> + movss r0m, xmm6 ; return (float)Syy_norm
> + fld dword r0m
> +%else
> + movaps m0, m6 ; return (float)Syy_norm
> +%endif
> + RET
The reg-reg movaps on x64 could be optimized away by reordering
registers so that the return value ends up in m0 instead of m6 from
the start.
> +INIT_XMM avx
The code has a bunch of if conditions for ymm regs but ymm regs are
never actually used?