[FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

Martin Vignali martin.vignali at gmail.com
Sun Dec 3 20:55:05 EET 2017


> Can you post a disassembly of hflip_byte_c?
>
>
At -O1 (clang -S -O1 test_asm_gen.c):

    .section    __TEXT,__text,regular,pure_instructions
    .macosx_version_min 10, 12
    .globl    _hflip_byte_c
    .p2align    4, 0x90
## ---------------------------------------------------------------------
## _hflip_byte_c -- clang -O1 output (AT&T syntax, Mach-O x86-64).
## Copies %edx bytes one at a time: reads start at (%rdi) and walk DOWN
## in memory, writes start at (%rsi) and walk UP, i.e. dst[i] = src[-i].
## In:     %rdi = read pointer, %rsi = write pointer, %edx = byte count
##         (presumably the src, dst and w arguments of the C function;
##         the C source is not shown here -- TODO confirm)
## Out:    no return register is set
## Writes: %rax, %rcx, %rdi, %rsi, flags
## ---------------------------------------------------------------------
_hflip_byte_c:                          ## @hflip_byte_c
    .cfi_startproc
## BB#0:
    ## Frame-pointer prologue; the function keeps no stack locals.
    pushq    %rbp
Ltmp0:
    .cfi_def_cfa_offset 16
Ltmp1:
    .cfi_offset %rbp, -16
    movq    %rsp, %rbp
Ltmp2:
    .cfi_def_cfa_register %rbp
    ## Signed check: a count <= 0 skips the loop entirely.
    testl    %edx, %edx
    jle    LBB0_3
## BB#1:
    ## 32-bit move zero-extends the count into %rax, the loop counter.
    movl    %edx, %eax
    .p2align    4, 0x90
LBB0_2:                                 ## =>This Inner Loop Header: Depth=1
    ## One byte per iteration: load from the read pointer, store to the
    ## write pointer, then step the pointers in opposite directions.
    movzbl    (%rdi), %ecx
    movb    %cl, (%rsi)
    decq    %rdi
    incq    %rsi
    decq    %rax
    jne    LBB0_2
LBB0_3:
    ## Common exit for both the empty and the non-empty case.
    popq    %rbp
    retq
    .cfi_endproc


.subsections_via_symbols






At -O2 or -O3 (clang -S -O3 test_asm_gen.c):

If I understand correctly, this is the same idea as Paul's patch,
but processing two xmm registers per iteration of the main loop.

    .section    __TEXT,__text,regular,pure_instructions
    .macosx_version_min 10, 12
    .section    __TEXT,__literal16,16byte_literals
    .p2align    4
## 16-byte pshufb control mask: output byte i takes input byte 15-i, so
## applying it reverses the byte order of an xmm register. It is 16-byte
## aligned by the .p2align above, which is what lets the code load it
## with movdqa.
LCPI0_0:
    .byte    15                      ## 0xf
    .byte    14                      ## 0xe
    .byte    13                      ## 0xd
    .byte    12                      ## 0xc
    .byte    11                      ## 0xb
    .byte    10                      ## 0xa
    .byte    9                       ## 0x9
    .byte    8                       ## 0x8
    .byte    7                       ## 0x7
    .byte    6                       ## 0x6
    .byte    5                       ## 0x5
    .byte    4                       ## 0x4
    .byte    3                       ## 0x3
    .byte    2                       ## 0x2
    .byte    1                       ## 0x1
    .byte    0                       ## 0x0
    .section    __TEXT,__text,regular,pure_instructions
    .globl    _hflip_byte_c
    .p2align    4, 0x90
## ---------------------------------------------------------------------
## _hflip_byte_c -- clang -O3 output (AT&T syntax, Mach-O x86-64).
## Same copy as the scalar version, dst[i] = src[-i] for i in [0, w):
## reads walk DOWN from (%rdi), writes walk UP from (%rsi).
## In:     %rdi = read pointer, %rsi = write pointer, %edx = byte count
##         (presumably the src, dst and w arguments of the C function;
##         the C source is not shown here -- TODO confirm)
## Out:    no return register is set
## Writes: %rax, %rcx, %rdx, %rdi, %r8-%r11, %xmm0-%xmm2, flags
##
## Structure:
##   1. count < 32, or the two byte ranges overlap -> scalar only.
##   2. otherwise a vector loop moves 32 bytes (two xmm) per iteration,
##      reversing each 16-byte half with pshufb.
##   3. the leftover bytes are handled by a 1-byte loop (count mod 4)
##      followed by a 4x unrolled byte loop.
## %r11 holds the number of bytes already copied when the scalar tail
## (LBB0_11) is entered.
## ---------------------------------------------------------------------
_hflip_byte_c:                          ## @hflip_byte_c
    .cfi_startproc
## BB#0:
    pushq    %rbp
Ltmp0:
    .cfi_def_cfa_offset 16
Ltmp1:
    .cfi_offset %rbp, -16
    movq    %rsp, %rbp
Ltmp2:
    .cfi_def_cfa_register %rbp
                                        ## kill: %EDX<def> %EDX<kill> %RDX<def>
    ## Signed check: a count <= 0 returns immediately.
    testl    %edx, %edx
    jle    LBB0_17
## BB#1:
    ## %r8 = count, zero-extended to 64 bits by the 32-bit move.
    movl    %edx, %r8d
    cmpl    $32, %edx
    jae    LBB0_3
## BB#2:
    ## Fewer than 32 bytes: nothing for the vector loop, go scalar.
    xorl    %r11d, %r11d
    jmp    LBB0_11
LBB0_3:
    ## %edx = count mod 32 (scalar remainder),
    ## %r11 = count rounded down to a multiple of 32 (vector byte count).
    andl    $31, %edx
    movq    %r8, %r11
    subq    %rdx, %r11
    je    LBB0_7
## BB#4:
    ## Runtime overlap check, part 1: the bytes read end at %rdi+1. If
    ## that is at or below the first byte written, the ranges are disjoint.
    leaq    1(%rdi), %rax
    cmpq    %rsi, %rax
    jbe    LBB0_8
## BB#5:
    ## Part 2: the bytes read start at %rdi-count+1 and the bytes written
    ## end at %rsi+count. Disjoint if the read range starts at or after
    ## the end of the write range.
    leaq    (%rsi,%r8), %r9
    movl    $1, %eax
    subq    %r8, %rax
    addq    %rdi, %rax
    cmpq    %r9, %rax
    jae    LBB0_8
LBB0_7:
    ## Ranges may overlap (or the vector count is zero): copy everything
    ## with the scalar loops, starting from offset 0.
    xorl    %r11d, %r11d
    jmp    LBB0_11
LBB0_8:
    ## Vector setup: %r9 points at the lowest byte of the first 16-byte
    ## group to read (%rdi-15 .. %rdi), %rax is the write pointer biased
    ## by +16, %r10 counts down the vector bytes.
    leaq    -15(%rdi), %r9
    leaq    16(%rsi), %rax
    movdqa    LCPI0_0(%rip), %xmm0    ## xmm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
    movq    %r11, %r10
    .p2align    4, 0x90
LBB0_9:                                 ## =>This Inner Loop Header: Depth=1
    ## Load two adjacent 16-byte groups (unaligned), byte-reverse each,
    ## and store them swapped: the higher-address group (%xmm2) goes to
    ## the lower 16 output bytes, the lower-address group (%xmm1) to the
    ## upper 16.
    movdqu    -16(%r9), %xmm1
    movdqu    (%r9), %xmm2
    pshufb    %xmm0, %xmm2
    pshufb    %xmm0, %xmm1
    movdqu    %xmm2, -16(%rax)
    movdqu    %xmm1, (%rax)
    addq    $-32, %r9
    addq    $32, %rax
    addq    $-32, %r10
    jne    LBB0_9
## BB#10:
    ## Done if the count was an exact multiple of 32.
    testl    %edx, %edx
    je    LBB0_17
LBB0_11:
    ## Scalar tail. %r11 = bytes already copied.
    ## %rax = remaining bytes mod 4 (length of the 1-byte peel loop),
    ## %r9  = remaining bytes - 1 (tested at LBB0_14).
    movl    %r8d, %eax
    subl    %r11d, %eax
    leaq    -1(%r8), %r9
    subq    %r11, %r9
    andq    $3, %rax
    je    LBB0_14
## BB#12:
    ## %rdx = read pointer for output offset %r11; %rax is negated so the
    ## loop can count up to zero.
    movq    %rdi, %rdx
    subq    %r11, %rdx
    negq    %rax
    .p2align    4, 0x90
LBB0_13:                                ## =>This Inner Loop Header: Depth=1
    ## Peel loop: one byte per iteration until the remainder is a
    ## multiple of 4.
    movzbl    (%rdx), %ecx
    movb    %cl, (%rsi,%r11)
    incq    %r11
    decq    %rdx
    incq    %rax
    jne    LBB0_13
LBB0_14:
    ## If fewer than 4 bytes remained on entry to the tail, the peel loop
    ## already copied all of them.
    cmpq    $3, %r9
    jb    LBB0_17
## BB#15:
    ## %r8 = bytes still to copy (a multiple of 4), %rdi = read pointer,
    ## %rax = write pointer biased by +3.
    subq    %r11, %r8
    subq    %r11, %rdi
    leaq    3(%rsi,%r11), %rax
    .p2align    4, 0x90
LBB0_16:                                ## =>This Inner Loop Header: Depth=1
    ## 4x unrolled byte copy: reads step down from (%rdi), writes step up
    ## from -3(%rax).
    movzbl    (%rdi), %ecx
    movb    %cl, -3(%rax)
    movzbl    -1(%rdi), %ecx
    movb    %cl, -2(%rax)
    movzbl    -2(%rdi), %ecx
    movb    %cl, -1(%rax)
    movzbl    -3(%rdi), %ecx
    movb    %cl, (%rax)
    addq    $-4, %rdi
    addq    $4, %rax
    addq    $-4, %r8
    jne    LBB0_16
LBB0_17:
    ## Common exit.
    popq    %rbp
    retq
    .cfi_endproc


.subsections_via_symbols


More information about the ffmpeg-devel mailing list