[FFmpeg-devel] [patch][OpenHEVC]added ASM functions for epel + qpel

Ronald S. Bultje rsbultje at gmail.com
Fri Mar 7 14:21:06 CET 2014


Hi,

> +%macro EPEL_FILTER 2                             ; bit depth, filter index
> +%ifdef PIC
> +    lea         rfilterq, [hevc_epel_filters_sse4_%1]
> +%else
> +    %define rfilterq hevc_epel_filters_sse4_%1
> +%endif
> +    sub              %2q, 1
> +    shl              %2q, 5                      ; multiply by 32
> +    movdqa           m14, [rfilterq + %2q]        ; get 2 first values of filters
> +    movdqa           m15, [rfilterq + %2q+16]     ; get 2 last values of filters
> +%endmacro

lea %2q, [%2q*8-8]
movdqa m14, [rfilterq+%2q*4]
movdqa m15, [rfilterq+%2q*4+16]

or

shl %2q, 5
movdqa m14, [rfilterq+%2q-32]
movdqa m15, [rfilterq+%2q-16]

i.e. remove the sub.

> +%macro EPEL_HV_FILTER 1
> +%ifdef PIC
> +    lea         rfilterq, [hevc_epel_filters_sse4_%1]
> +%else
> +    %define rfilterq hevc_epel_filters_sse4_%1
> +%endif
> +    sub              mxq, 1
> +    sub              myq, 1
> +    shl              mxq, 5                      ; multiply by 32
> +    shl              myq, 5                      ; multiply by 32
> +    movdqa           m14, [rfilterq + mxq]        ; get 2 first values of filters
> +    movdqa           m15, [rfilterq + mxq+16]     ; get 2 last values of filters
[..]
> +%ifdef PIC
> +    lea         rfilterq, [hevc_epel_filters_sse4_10]
> +%else
> +    %define rfilterq hevc_epel_filters_sse4_10
> +%endif
> +    movdqa           m12, [rfilterq + myq]        ; get 2 first values of filters
> +    movdqa           m13, [rfilterq + myq+16]     ; get 2 last values of filters
> +%endmacro

Same, remove the subs.

> +    lea           r3srcq, [srcstrideq*3]

(That's a mildly weird register name, "r3src", I'd have called it
srcstride3 or so.)

> +%macro QPEL_FILTER 2
> +%ifdef PIC
> +    lea         rfilterq, [hevc_qpel_filters_sse4_%1]
> +%else
> +    %define rfilterq hevc_qpel_filters_sse4_%1
> +%endif
> +    sub              %2q, 1
> +    shl              %2q, 6                      ; multiply by 16
> +    movdqa           m12, [rfilterq + %2q]       ; get 4 first values of filters
> +    movdqa           m13, [rfilterq + %2q + 16]  ; get 4 first values of filters
> +    movdqa           m14, [rfilterq + %2q + 32]  ; get 4 first values of filters
> +    movdqa           m15, [rfilterq + %2q + 48]  ; get 4 first values of filters
> +%endmacro

Remove the sub.

> +%macro EPEL_LOAD 4
> +%ifdef PIC
> +    lea rfilterq, [%2]
> +%else
> +    %define rfilterq %2
> +%endif
> +    movdqu            m0, [rfilterq ]            ;load 128bit of x

Wait, what? Why are you lea'ing here? %2 is already an address, not a
label, so the lea does nothing.

> +%ifnum %3
> +    movdqu            m1, [rfilterq+  %3]        ;load 128bit of x+stride
> +    movdqu            m2, [rfilterq+2*%3]        ;load 128bit of x+2*stride
> +    movdqu            m3, [rfilterq+3*%3]        ;load 128bit of x+3*stride
> +%else
> +    movdqu            m1, [rfilterq+  %3q]       ;load 128bit of x+stride
> +    movdqu            m2, [rfilterq+2*%3q]       ;load 128bit of x+2*stride
> +    movdqu            m3, [rfilterq+r3srcq]      ;load 128bit of x+2*stride
> +%endif

I think I mentioned before that if %4 <= 4, you should use movd, and if %4
== 8, you should use movq. Check Agner Fog's instruction tables for the
exact cycle counts, but both are faster than a full movdqu.

> +%macro QPEL_H_LOAD 3
> +%assign %%stride (%1+7)/8
> +
> +    movdqu            m0, [%2-3*%%stride]       ; load data from source
> +    movdqu            m1, [%2-2*%%stride]
> +    movdqu            m2, [%2-%%stride]
> +    movdqu            m3, [%2  ]
> +    movdqu            m4, [%2+%%stride]
> +    movdqu            m5, [%2+2*%%stride]
> +    movdqu            m6, [%2+3*%%stride]
> +    movdqu            m7, [%2+4*%%stride]
> +

if %1*%%stride <= 4, use movd; if 8, use movq.

> +%macro MC_PIXEL_COMPUTE 2 ;width, bitdepth
> +%if %2 == 8
> +%if %1 > 8
> +    movhlps           m1, m0
> +    pmovzxbw          m1, m1
> +    psllw             m1, 14-%2
> +%endif
> +    pmovzxbw          m0, m0
> +%endif
> +    psllw             m0, 14-%2
> +%endmacro

Ah, I see your sse4 instructions. Is this faster than a simple pxor at the
start of the loop (_only in the functions that use this macro_ - so only
fullpel) and punpcklbw/punpckhbw here (or SBUTTERFLY)? I'd say that this
being ssse3 would be a major advantage.

> +%macro HEVC_PUT_HEVC_PEL_PIXELS 2
> +cglobal hevc_put_hevc_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride,height
> +.loop
> +%if   %2 == 8
> +%if   %1 == 4
> +    movd              m0, [srcq]                 ; load data from source

Shouldn't this be "%if %1 <= 4"? Otherwise the %1 == 2 case will use a full
movdqu.

> +cglobal hevc_put_hevc_pel_pixels24_8, 5, 5, 2, dst, dststride, src, srcstride,height
> +.loop
> +    movdqu            m0, [srcq]                 ; load data from source
> +    movhlps           m1, m0
> +    pmovzxbw          m0, m0
> +    pmovzxbw          m1, m1
> +    psllw             m0, 6
> +    psllw             m1, 6
> +    movdqa        [dstq], m0
> +    movdqa   [dstq + 16], m1                     ; store 16
> +    movq              m0, [srcq + 16]
> +    pmovzxbw          m0, m0
> +    psllw             m0, 6
> +    movdqa   [dstq + 32], m0
> +    LOOP_END         dst, dststride, src, srcstride
> +    RET

Don't forget to change this to use ssse3 also (punpcklbw/punpckhbw with a
pre-loop initialized pxor, instead of movhlps+pmovzxbw).

More to come.

Ronald


More information about the ffmpeg-devel mailing list