[FFmpeg-devel] HEVC patch : adding ASM functions

Clément Bœsch u at pkh.me
Sun Feb 23 19:21:21 CET 2014


On Fri, Feb 21, 2014 at 04:07:53PM +0100, Pierre Edouard Lepere wrote:
[...]
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
> +cextern hevc_epel_filters
> +cextern hevc_qpel_filters
> +
> +epel_extra_before   DB  1                        ;corresponds to EPEL_EXTRA_BEFORE in hevc.h
> +max_pb_size         DB  64                       ;corresponds to MAX_PB_SIZE in hevc.h
> +
> +qpel_extra_before   DB  0,  3,  3,  2            ;corresponds to the ff_hevc_qpel_extra_before in hevc.c
> +qpel_extra          DB  0,  6,  7,  6            ;corresponds to the ff_hevc_qpel_extra in hevc.c
> +qpel_extra_after    DB  0,  3,  4,  4
> +
> +
> +single_mask_16      DW  0,  0,  0,  0,  0,  0,  0, -1
> +single_mask_32      DW  0,  0,  0,  0,  0,  0, -1, -1

The common convention is to use ':' after labels, and no caps for
db/dw/etc

> +SECTION .text
> +

> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> +;;
> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

?

> +%macro LOOP_INIT 2
> +    pxor             m15, m15                    ; set register at zero
> +    mov              r10, 0                      ; set height counter
> +%1:
> +    mov               r9, 0                      ; set width counter
> +%2:

make sure these labels are prefixed with a '.'

> +%endmacro
> +
> +%macro LOOP_END 7
> +    add               r9, %3                     ; add 4 for width loop
> +    cmp               r9, widthq                 ; cmp width
> +    jl                %2                         ; width loop
> +%ifidn %5, dststride
> +    lea              %4q, [%4q+2*%5q]           ; dst += dststride
> +%else
> +    lea              %4q, [%4q+  %5 ]           ; dst += dststride
> +%endif

> +%ifidn %7, srcstride
> +    lea              %6q, [%6q+  %7q]           ; src += srcstride
> +%else
> +    lea              %6q, [%6q+  %7 ]           ; src += srcstride
> +%endif

Is there a place where it's not srcstride?

If not, you probably want to use the default value system with something
like:

%macro LOOP_END 6-7 srcstrideq
...

> +    add              r10, 1
> +    cmp              r10, heightq                ; cmp height
> +    jl                %1                         ; height loop
> +%endmacro
> +
> +%macro LOOP_EPEL_HV_INIT 4
> +    pxor             m15, m15                    ; set register at zero
> +    mov               r9, 0                      ; set width counter
> +%1:
> +    mov              r10, 0                      ; set height counter
> +;first 3 H calculus
> +%if %3 == 8
> +    EPEL_LOAD         %3, mx, 1
> +%else
> +    EPEL_LOAD         %3, mx, 2
> +%endif
> +    EPEL_COMPUTE      %3, %4
> +    SWAP              m4, m0
> +    lea              mxq, [mxq + srcstrideq]
> +%if %3 == 8
> +    EPEL_LOAD         %3, mx, 1
> +%else
> +    EPEL_LOAD         %3, mx, 2
> +%endif
> +    EPEL_COMPUTE      %3, %4
> +    SWAP              m5, m0
> +    lea              mxq, [mxq + srcstrideq]
> +%if %3 == 8
> +    EPEL_LOAD         %3, mx, 1
> +%else
> +    EPEL_LOAD         %3, mx, 2
> +%endif

instead of this %if ... %else ..., can you either do something like:
EPEL_LOAD %3, mx, 1 + (%3 == 8)
or simply add the condition on %3 in the macro itself?

> +    EPEL_COMPUTE      %3, %4
> +    SWAP              m6, m0
> +    lea              mxq, [mxq + srcstrideq]
> +%2:

. prefix

> +%endmacro
> +
> +%macro LOOP_HV_END 9
> +    lea              %8q, [%8q+2*%5q]            ; dst += dststride
> +    lea              %9q, [%9q+  %7q]            ; src += srcstride
> +    inc              r10                         ; add 1 for height loop
> +    cmp              r10, heightq                ; cmp height
> +    jl                %2                         ; height loop
> +    add               r9, %3                     ; add width
> +    lea              %8q,[%4q]                   ; dst = dst2
> +    lea              %9q,[%6q]                   ; src = src2
> +    cmp               r9, widthq                 ; cmp width
> +    jl                %1                         ; weight loop
> +%endmacro
> +
> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> +;;
> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> +
> +
> +%macro EPEL_FILTER 2
> +    movsxd           %2q, %2d                    ; extend sign
> +    sub              %2q, 1
> +    shl              %2q, 2                      ; multiply by 4
> +    lea              r11, [hevc_epel_filters]
> +    movq             m13, [r11 + %2q]            ; get 4 first values of filters
> +%if %1 == 8
> +    punpcklwd        m13, m13
> +%else
> +    pmovsxbw         m13, m13
> +%endif
> +    punpckldq        m13, m13
> +    movdqa           m14, m13
> +    punpckhqdq       m14, m14                    ; contains last 2 filters
> +    punpcklqdq       m13, m13                    ;
> +%endmacro
> +
> +%macro EPEL_HV_FILTER 1
> +    movsxd           mxq, mxd                    ; extend sign
> +    movsxd           myq, myd                    ; extend sign
> +    sub              mxq, 1
> +    sub              myq, 1
> +    shl              mxq, 2                      ; multiply by 4
> +    shl              myq, 2                      ; multiply by 4
> +    lea              r11, [hevc_epel_filters]
> +    movq             m13, [r11 + mxq]            ; get 4 first values of H filter
> +    movq             m11, [r11 + myq]

> +%if %1 == 8
> +    punpcklwd        m13, m13
> +    pmovsxbw         m11, m11                    ; need 16bit filters for V
> +%else
> +    pmovsxbw         m13, m13
> +    pmovsxbw         m11, m11
> +%endif

pmovsxbw looks common to both

> +    punpckldq        m13, m13
> +    punpckldq        m11, m11
> +    movdqa           m14, m13
> +    movdqa           m12, m11
> +    punpckhqdq       m14, m14                    ;H
> +    punpckhqdq       m12, m12                    ;V
> +    punpcklqdq       m13, m13                    ;H
> +    punpcklqdq       m11, m11                    ;V
> +%endmacro
> +
> +
> +%macro QPEL_H_FILTER 2
> +    movsxd           %2q, %2d                    ; extend sign
> +    sub              %2q, 1
> +    shl              %2q, 4                      ; multiply by 16
> +    lea              r11, [hevc_qpel_filters]
> +    movq             m11, [r11 + %2q]            ; get 4 first values of filters
> +%if %1 == 8
> +    punpcklwd        m11, m11
> +%else
> +    pmovsxbw         m11, m11
> +%endif
> +    punpckhdq        m13, m11, m11
> +    punpckldq        m11, m11
> +    movdqa           m14, m13
> +    movdqa           m12, m11
> +    punpckhqdq       m14, m14                    ; contains last 2 filters
> +    punpcklqdq       m13, m13                    ;
> +    punpckhqdq       m12, m12                    ;
> +    punpcklqdq       m11, m11                    ; contains first 2 filters
> +
> +%endmacro
> +
> +%macro QPEL_V_FILTER 2
> +    movdqu            m6, [qpel_h_filter%1_10]
> +    psrldq            m7, m6, 2
> +    psrldq            m8, m6, 4
> +    psrldq            m9, m6, 6
> +    psrldq           m10, m6, 8
> +    psrldq           m11, m6, 10
> +    psrldq           m12, m6, 12
> +    psrldq           m13, m6, 14
> +
> +    punpcklwd         m6, m6                    ; put double values to 32bit.
> +    punpcklwd         m7, m7
> +    punpcklwd         m8, m8
> +    punpcklwd         m9, m9
> +    punpcklwd        m10, m10
> +    punpcklwd        m11, m11
> +    punpcklwd        m12, m12
> +    punpcklwd        m13, m13
> +

> +%if %2 == 10
> +    psrad             m6, m6, 16
> +    psrad             m7, m7, 16
> +    psrad             m8, m8, 16
> +    psrad             m9, m9, 16
> +    psrad            m10, m10, 16
> +    psrad            m11, m11, 16
> +    psrad            m12, m12, 16
> +    psrad            m13, m13, 16
> +%endif
> +

You can remove the first argument here

> +    pshufd            m6, m6, 0
> +    pshufd            m7, m7, 0
> +    pshufd            m8, m8, 0
> +    pshufd            m9, m9, 0
> +    pshufd           m10, m10, 0
> +    pshufd           m11, m11, 0
> +    pshufd           m12, m12, 0
> +    pshufd           m13, m13, 0

ditto.

Also maybe SPLAT* macro?

> +
> +%endmacro
> +
> +
> +
> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> +;;
> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

?

[...]
> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> +;;
> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> +
> +%if ARCH_X86_64
> +INIT_XMM sse4                                    ; adds ff_ and _sse4 to function name
> +
> +; ******************************
> +; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
> +;                         uint8_t *_src, ptrdiff_t _srcstride,
> +;                         int width, int height, int mx, int my,
> +;                         int16_t* mcbuffer)
> +;

> +;        r0 : *dst
> +;        r1 : dststride
> +;        r2 : *src
> +;        r3 : srcstride
> +;        r4 : width
> +;        r5 : height

You named them in the cglobal, so this comment is not useful.

Same below

[...]

> +cglobal hevc_put_hevc_pel_pixels2_8, 9, 12, 0 , dst, dststride, src, srcstride,width,height
> +    PUT_HEVC_MC_PIXELS 2, 8, epel
> +cglobal hevc_put_hevc_pel_pixels4_8, 9, 12, 0 , dst, dststride, src, srcstride,width,height
> +    PUT_HEVC_MC_PIXELS 4, 8, epel
> +cglobal hevc_put_hevc_pel_pixels8_8, 9, 12, 0 , dst, dststride, src, srcstride,width,height
> +    PUT_HEVC_MC_PIXELS 8, 8, epel
> +cglobal hevc_put_hevc_pel_pixels16_8, 9, 12, 0 , dst, dststride, src, srcstride,width,height
> +    PUT_HEVC_MC_PIXELS 16, 8, epel
> +
> +cglobal hevc_put_hevc_pel_pixels2_10, 9, 12, 0 , dst, dststride, src, srcstride,width,height
> +    PUT_HEVC_MC_PIXELS 2, 10, epel
> +cglobal hevc_put_hevc_pel_pixels4_10, 9, 12, 0 , dst, dststride, src, srcstride,width,height
> +    PUT_HEVC_MC_PIXELS 4, 10, epel
> +cglobal hevc_put_hevc_pel_pixels8_10, 9, 12, 0 , dst, dststride, src, srcstride,width,height
> +    PUT_HEVC_MC_PIXELS 8, 10, epel

Move the cglobal into the macro?

Same for all the others below

[...]

-- 
Clément B.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 490 bytes
Desc: not available
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20140223/ad2fdb54/attachment.asc>


More information about the ffmpeg-devel mailing list