[FFmpeg-devel] [PATCH]v6 Opus Pyramid Vector Quantization Search in x86 SIMD asm
Henrik Gramner
henrik at gramner.com
Mon Jul 31 19:10:40 EEST 2017
On Wed, Jul 26, 2017 at 4:56 PM, Ivan Kalvachev <ikalvachev at gmail.com> wrote:
> +++ b/libavcodec/x86/opus_pvq_search.asm
Generic minor stuff:
- Use rN instead of rNq for numbered registers (the q suffix is used for
  named args only, due to preprocessor limitations).
- Use the same "standard" vertical alignment rules as most existing
  code, e.g. instructions indented by 4 spaces and operands aligned on
  the first comma.
- Use xmN instead of xmmN (it only really makes a difference when SWAPs
  are used, but might as well do it "correctly").
- Use 32-bit operands, e.g. rNd, when 64-bit math isn't required.
- Unless aligning every single loop entry helps a lot, I'd avoid it
  since it wastes a bit of icache.
- Explicitly using the carry flag as a branch condition is a bit weird;
  jg/jl/jge/jle tends to be clearer.
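To illustrate, a few lines from the patch rewritten that way (a purely
hypothetical loop tail, just showing the formatting):

    movups  m2, [inXq + r4]     ; rN instead of rNq
    andps   m2, m3
    movaps  [tmpX + r4], m2
    sub     r4d, mmsize         ; 32-bit operand, 64-bit math not needed here
    jge     %%loop              ; signed condition instead of the carry flag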
> +%ifdef __NASM_VER__
> +%use "smartalign"
> +ALIGNMODE p6
> +%endif
Assembler-specific ifdeffery should be avoided in individual files.
Something equivalent already exists in x86inc actually but I don't
remember if it was merged to FFmpeg from upstream (x264) yet.
> +const_int32_offsets:
> + %rep 8
> + dd $-const_int32_offsets
> + %endrep
Isn't this just an overly complicated way of expressing "dd 0*4, 1*4,
2*4, 3*4, 4*4, 5*4, 6*4, 7*4"?
> +SECTION .text
> +
> +
> +
> +
Reduce some of the excessive whitespace.
> +%macro HSUMPS 2 ; dst/src, tmp
> +%if cpuflag(avx)
> + %if sizeof%1>=32 ; avx
> + vperm2f128 %2, %1, %1, (0)*16+(1) ; %2=lo(%1)<<128+hi(%1)
> + vaddps %1, %2
> + %endif
> + vshufps %2, %1, %1, q1032
> + vaddps %1, %2
> + vshufps %2, %1, %1, q0321
> + vaddps %1, %2
> +
> +%else ; this form is a bit faster than the short avx-like emulation.
> + movaps %2, %1 ; [d, c, b, a ]
> + shufps %1, %1, q1032 ; %2=[b, a, d, c ]
> + addps %1, %2 ; %1=[b+d, a+c, d+b, c+a ]
> + movaps %2, %1
> + shufps %1, %1, q0321 ; %2=[c+a, b+d, a+c, d+b ]
> + addps %1, %2 ; %1=[c+a+b+d, b+d+a+c, a+c+d+b, d+b+c+a ]
> + ; all %1 members should be equal for as long as float a+b==b+a
> +%endif
> +%endmacro
Is reordering moves for the non-AVX path worth the additional
complexity? Microoptimizations that only affect legacy hardware are
IMO a bit questionable.
> +%macro EMU_pblendvb 3 ; dst/src_a, src_b, mask
> +%if cpuflag(avx)
> + %if cpuflag(avx) && notcpuflag(avx2) && sizeof%1 >= 32
> + %error AVX1 does not support integer 256 bit ymm operations
> + %endif
> +
> + vpblendvb %1, %1, %2, %3
> +;-------------------------
> +%elif cpuflag(sse4)
> + %ifnidn %3,xmm0
> + %error sse41 blendvps uses xmm0 as default 3 operand, you used %3
> + %endif
> + pblendvb %1, %2, %3
> +;-------------------------
> +%else
> + pxor %2, %1
> + pand %2, %3
> + pxor %1, %2
> +%endif
> +%endmacro
The AVX1 integer warning is kind of silly and doesn't really serve any purpose.
> +%macro EMU_broadcastss 2 ; dst, src
> +%if cpuflag(avx2)
> + vbroadcastss %1, %2 ; ymm, xmm
> +;-------------------------
> +%elif cpuflag(avx)
> + %ifnum sizeof%2 ; avx1 register
> + vpermilps xmm%1, xmm%2, q0000 ; xmm, xmm, imm || ymm, ymm, imm
> + %if sizeof%1 >= 32 ; mmsize>=32
> + vinsertf128 %1, %1,xmm%1, 1 ; ymm, ymm, xmm, im
> + %endif
> + %else ; avx1 memory
> + vbroadcastss %1, %2 ; ymm, mm32 || xmm, mm32
> + %endif
> +;-------------------------
> +%else
> + %ifnum sizeof%2 ; sse register
> + shufps %1, %2, %2, q0000
> + %else ; sse memory
> + movss %1, %2 ; this zeroes the other 3 elements
> + shufps %1, %1, 0
> + %endif
> +%endif
> +%endmacro
Use the existing x86util VBROADCASTSS macro. Modify it if it's lacking
something.
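I.e. with x86util.asm %include:d, usage should just be something like
the following (operands are placeholders; extend the macro if it
doesn't handle a register source the way your EMU_broadcastss does):

    VBROADCASTSS m4, xm0      ; splat from register
    VBROADCASTSS m3, [r5]     ; splat from memory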
> +%macro EMU_pbroadcastd 2 ; dst, src
> +%if cpuflag(avx2)
> + vpbroadcastd %1, %2
> +%elif cpuflag(avx) && mmsize >= 32
> + %error AVX1 does not support integer 256 bit ymm operations
> +%else
> + %ifnum sizeof%2 ; sse2 register
> + pshufd %1, %2, q0000
> + %else ; sse memory
> + movd %1, %2 ; movd zero extends
> + pshufd %1, %1, 0
> + %endif
> +%endif
> +%endmacro
Same as above, but using the SPLATD macro.
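I.e. roughly the following, if I remember the x86util SPLATD signature
correctly (single operand, in-place, so the load stays separate;
operands are placeholders):

    movd    m1, [r5]     ; movd zero-extends
    SPLATD  m1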
> +; Merge parallel maximums final round (1 vs 1)
> + movaps xmm0, xmm3 ; m0 = m3[0] = p[0]
> + shufps xmm3, xmm3, q1111 ; m3 = m3[1] = p[1]
> + cmpless xmm0, xmm3
Use 3-arg instead of reg-reg movaps.
Use the actual cmpss instruction instead of pseudo-instructions.
x86inc doesn't support automatic VEX handling for every possible
pseudo-instruction (mainly because there are so many of them), which
will result in AVX state transitions if INIT_YMM is used.
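Roughly (untested; LE is predicate 2, and note the mask ends up in xm3
here instead of xm0):

    shufps  xm0, xm3, xm3, q1111    ; xm0 = p[1] in all lanes
    cmpss   xm3, xm0, 2             ; xm3 = (p[0] <= p[1]) ? -1 : 0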
> +%macro SET_PIC_BASE 3; reg, const_label
> +%ifdef PIC
> + %{1} %2, [%3] ; lea r5q, [rip+const]
> + %define pic_base_%3 %2
> +%else
> + %define pic_base_%3 %3
> +%endif
> +%endmacro
[...]
> + SET_PIC_BASE lea, r5q, const_align_abs_edge ; rip+const
> + movups m3, [pic_base_const_align_abs_edge + r4q - mmsize]
This macro is only ever used once, remove and inline it to reduce complexity.
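I.e. just inline it as something along these lines:

    %ifdef PIC
        lea     r5, [const_align_abs_edge]    ; rip-relative with DEFAULT REL
        movups  m3, [r5 + r4 - mmsize]
    %else
        movups  m3, [const_align_abs_edge + r4 - mmsize]
    %endif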
> +%if num_mmregs > 8
> +%define num_hireg_used 3
> +%endif
IIRC the number of used mmregs is ignored in 32-bit mode so I believe
you can unconditionally specify 11.
> + lea r4q, [Nq - mmsize] ; Nq is rounded up (aligned up) to mmsize, so r4q can't become negative here, unless N=0.
> + movups m2, [inXq + r4q]
> + andps m2, m3
> + movaps [tmpX + r4q], m2
> + movaps m1, m2 ; Sx
Redundant reg-reg movaps, use m1 directly instead of using m2 at all.
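I.e.:

    movups    m1, [inXq + r4]
    andps     m1, m3
    movaps    [tmpX + r4], m1     ; m1 = Sx
    cvttps2dq m2, m1              ; (or cvtps2dq) convert straight from m1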
> + %if PRESEARCH_ROUNDING == 0
> + cvttps2dq m2, m2 ; yt = (int)truncf( res*X[i] )
> + %else
> + cvtps2dq m2, m2 ; yt = (int)lrintf( res*X[i] )
> + %endif
> + paddd m5, m2 ; Sy += yt
> + cvtdq2ps m2, m2 ; yt = (float)yt
Would using roundps instead of doing float-int-float conversion be beneficial?
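For reference, that would be something like this (SSE4.1+ only, and Sy
would then have to be accumulated as float or converted separately, so
it may not actually be a win):

    %if PRESEARCH_ROUNDING == 0
        roundps m2, m2, 3    ; truncate toward zero
    %else
        roundps m2, m2, 0    ; round to nearest (even)
    %endif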
> + mulps m1, m2 ; m1 = X[i]*yt
> + movaps [tmpY + r4q], m2 ; y[i] = m2
> + addps m7, m1 ; Sxy += m1;
> + mulps m2, m2 ; m2 = yt*yt
> + addps m6, m2 ; Syy += m2
Potential use case for 2x FMA instead of 2x mul+add.
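E.g. on an FMA3-gated code path, with m1 = X[i] and m2 = yt as above:

    vfmadd231ps m7, m1, m2    ; Sxy += X[i]*yt
    vfmadd231ps m6, m2, m2    ; Syy += yt*yt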
> +%%restore_sign_loop:
> + movaps m0, [tmpY + r4q] ; m0 = Y[i]
> + movups m1, [inXq + r4q] ; m1 = X[i]
> + andps m1, m2 ; m1 = sign(X[i])
> + orps m0, m1 ; m0 = Y[i]*sign
> + cvtps2dq m3, m0 ; m3 = (int)m0
> + movaps [outYq + r4q], m3
The orps instruction could be done using a memory operand instead of
movaps+orps.
AVX supports unaligned memory operands so the same could conditionally
be done for the andps as well but that's a bit of a nit.
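I.e. something like:

    movups   m1, [inXq + r4]     ; m1 = X[i]
    andps    m1, m2              ; m1 = sign(X[i])
    orps     m1, [tmpY + r4]     ; m1 = Y[i]*sign (tmpY is aligned)
    cvtps2dq m3, m1
    movaps   [outYq + r4], m3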
> +%if ARCH_X86_64 == 0 ;sbrdsp
> + movss r0m, xmm6 ; return (float)Syy_norm
> + fld dword r0m
> +%else
> + movaps m0, m6 ; return (float)Syy_norm
> +%endif
> + RET
The reg-reg movaps on x64 could be optimized away by reordering
registers so that the return value ends up in m0 instead of m6 from
the start.
> +INIT_XMM avx
The code has a bunch of if conditions for ymm regs but ymm regs are
never actually used?