[FFmpeg-devel] [PATCH] x86: hevc: adding transform_add

James Almer jamrial at gmail.com
Wed Jul 30 20:39:53 CEST 2014


On 30/07/14 10:33 AM, Pierre Edouard Lepere wrote:
> Hi,
> 
> Here's a patch adding ASM transform_add functions for HEVC.
> 
> Regards,
> Pierre-Edouard Lepere

Some remarks below.

> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
> index 7469293..658ad5e 100644
> --- a/libavcodec/x86/Makefile
> +++ b/libavcodec/x86/Makefile
> @@ -117,7 +117,8 @@ YASM-OBJS-$(CONFIG_AAC_DECODER)        += x86/sbrdsp.o
>  YASM-OBJS-$(CONFIG_DCA_DECODER)        += x86/dcadsp.o
>  YASM-OBJS-$(CONFIG_HEVC_DECODER)       += x86/hevc_mc.o                 \
>                                            x86/hevc_deblock.o            \
> -                                          x86/hevc_idct.o
> +                                          x86/hevc_idct.o               \
> +                                          x86/hevc_res_add.o

Why not add this to hevc_idct.c? It technically used to be there before the rext split.

>  YASM-OBJS-$(CONFIG_PNG_DECODER)        += x86/pngdsp.o
>  YASM-OBJS-$(CONFIG_PRORES_DECODER)     += x86/proresdsp.o
>  YASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
> diff --git a/libavcodec/x86/hevc_res_add.asm b/libavcodec/x86/hevc_res_add.asm
> new file mode 100644
> index 0000000..bc550ef
> --- /dev/null
> +++ b/libavcodec/x86/hevc_res_add.asm
> @@ -0,0 +1,454 @@
> +; /*
> +; * Provide intrinsics for transform_add functions for HEVC decoding

It's not intrinsics anymore :P

> +; * Copyright (c) 2014 Pierre-Edouard LEPERE
> +; *
> +; * This file is part of FFmpeg.
> +; *
> +; * FFmpeg is free software; you can redistribute it and/or
> +; * modify it under the terms of the GNU Lesser General Public
> +; * License as published by the Free Software Foundation; either
> +; * version 2.1 of the License, or (at your option) any later version.
> +; *
> +; * FFmpeg is distributed in the hope that it will be useful,
> +; * but WITHOUT ANY WARRANTY; without even the implied warranty of
> +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +; * Lesser General Public License for more details.
> +; *
> +; * You should have received a copy of the GNU Lesser General Public
> +; * License along with FFmpeg; if not, write to the Free Software
> +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +; */
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA

SECTION_RODATA 32

It's needed for the AVX2 functions.

> +max_pixels_10:          times 16  dw ((1 << 10)-1)
> +tr_add_10:              times 4 dd ((1 << 14-10) + 1)
> +
> +
> +SECTION .text
> +
> +;the tr_add macros and functions were largely inspired by x264 project's code in the h264_idct.asm file
> +%macro TR_ADD_INIT_MMX 2
> +    mova              m2, [r1]
> +    mova              m4, [r1+8]
> +    pxor              m3, m3
> +    psubw             m3, m2
> +    packuswb          m2, m2
> +    packuswb          m3, m3
> +    pxor              m5, m5
> +    psubw             m5, m4
> +    packuswb          m4, m4
> +    packuswb          m5, m5
> +%endmacro
> +
> +%macro TR_ADD_OP_MMX 4

This macro is used only by one function. %1 is always movh.
Just hardcode it like the movas above.
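Untested, but something like this should do (with the unused last argument dropped as
well, and the caller changed to TR_ADD_OP_MMX r0, r2):

%macro TR_ADD_OP_MMX 2
    movh              m0, [%1     ]
    movh              m1, [%1+%2  ]
    paddusb           m0, m2
    paddusb           m1, m4
    psubusb           m0, m3
    psubusb           m1, m5
    movh       [%1     ], m0
    movh       [%1+%2  ], m1
%endmacro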

> +    %1                m0, [%2     ]
> +    %1                m1, [%2+%3  ]
> +    paddusb           m0, m2
> +    paddusb           m1, m4
> +    psubusb           m0, m3
> +    psubusb           m1, m5
> +    %1         [%2     ], m0
> +    %1         [%2+%3  ], m1
> +%endmacro
> +
> +%macro TR_ADD_INIT_SSE_8 2
> +    movu              m4, [r1]
> +    movu              m6, [r1+16]
> +    movu              m8, [r1+32]
> +    movu             m10, [r1+48]
> +    lea               %1, [%2*3]
> +    pxor              m5, m5
> +    psubw             m5, m4
> +    packuswb          m4, m4
> +    packuswb          m5, m5
> +    pxor              m7, m7
> +    psubw             m7, m6
> +    packuswb          m6, m6
> +    packuswb          m7, m7
> +    pxor              m9, m9
> +    psubw             m9, m8
> +    packuswb          m8, m8
> +    packuswb          m9, m9
> +    pxor             m11, m11
> +    psubw            m11, m10
> +    packuswb         m10, m10
> +    packuswb         m11, m11
> +%endmacro
> +
> +%macro TR_ADD_INIT_SSE_16 2
> +    lea               %1, [%2*3]
> +    movu              m4, [r1]
> +    movu              m6, [r1+16]
> +    pxor              m5, m5
> +    psubw             m7, m5, m6
> +    psubw             m5, m4
> +    packuswb          m4, m6
> +    packuswb          m5, m7
> +
> +    movu              m6, [r1+32]
> +    movu              m8, [r1+48]
> +    pxor              m7, m7
> +    psubw             m9, m7, m8
> +    psubw             m7, m6
> +    packuswb          m6, m8
> +    packuswb          m7, m9
> +
> +    movu              m8, [r1+64]
> +    movu             m10, [r1+80]
> +    pxor              m9, m9
> +    psubw            m11, m9, m10
> +    psubw             m9, m8
> +    packuswb          m8, m10
> +    packuswb          m9, m11
> +
> +    movu             m10, [r1+96]
> +    movu             m12, [r1+112]
> +    pxor             m11, m11
> +    psubw            m13, m11, m12
> +    psubw            m11, m10
> +    packuswb         m10, m12
> +    packuswb         m11, m13
> +%endmacro
> +
> +%macro TR_ADD_OP_SSE 4
> +    %1                m0, [%2     ]
> +    %1                m1, [%2+%3  ]
> +    %1                m2, [%2+%3*2]
> +    %1                m3, [%2+%4  ]
> +    paddusb           m0, m4
> +    paddusb           m1, m6
> +    paddusb           m2, m8
> +    paddusb           m3, m10
> +    psubusb           m0, m5
> +    psubusb           m1, m7
> +    psubusb           m2, m9
> +    psubusb           m3, m11
> +    %1         [%2     ], m0
> +    %1         [%2+%3  ], m1
> +    %1         [%2+2*%3], m2
> +    %1         [%2+%4  ], m3
> +%endmacro
> +
> +%macro TR_ADD_OP_SSE_32 4
> +    %1                m0, [%2      ]
> +    %1                m1, [%2+16   ]
> +    %1                m2, [%2+%3   ]
> +    %1                m3, [%2+%3+16]
> +    paddusb           m0, m4
> +    paddusb           m1, m6
> +    paddusb           m2, m8
> +    paddusb           m3, m10
> +    psubusb           m0, m5
> +    psubusb           m1, m7
> +    psubusb           m2, m9
> +    psubusb           m3, m11
> +    %1        [%2      ], m0
> +    %1        [%2+16   ], m1
> +    %1        [%2+%3   ], m2
> +    %1        [%2+%3+16], m3
> +%endmacro
> +
> +
> +INIT_MMX mmxext
> +; void ff_hevc_tranform_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
> +cglobal hevc_transform_add4_8, 3, 4, 6
> +    TR_ADD_INIT_MMX   r3, r2
> +    TR_ADD_OP_MMX   movh, r0, r2, r3
> +    lea               r1, [r1+16]

add r1, 16
There are several similar cases below where a simple lea can be replaced with an add.

> +    lea               r0, [r0+r2*2]
> +    TR_ADD_INIT_MMX   r3, r2
> +    TR_ADD_OP_MMX   movh, r0, r2, r3
> +    RET
> +
> +%if ARCH_X86_64
> +INIT_XMM sse2
> +; void ff_hevc_transform_add8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
> +cglobal hevc_transform_add8_8, 3, 4, 6

You're using 12 xmm registers, but only declaring 6.
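That is, the declaration should look like this (or fewer once the register usage is
trimmed down):

cglobal hevc_transform_add8_8, 3, 4, 12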

> +    TR_ADD_INIT_SSE_8 r3, r2
> +    TR_ADD_OP_SSE   movh, r0, r2, r3
> +    lea               r1, [r1+8*8]
> +    lea               r0, [r0+r2*4]
> +    TR_ADD_INIT_SSE_8 r3, r2
> +    TR_ADD_OP_SSE   movh, r0, r2, r3
> +    RET
> +
> +
> +; void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
> +cglobal hevc_transform_add16_8, 3, 4, 6

Same as above: you're using 14 xmm registers here, and in the function below.
Do you really need that many? It should be possible to use fewer if you rearrange
some instructions.
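For example, interleaving the loads, pack and add so you handle one 16-pixel row at a
time needs only five registers. Untested sketch of the per-row body (the surrounding
unrolling would need adjusting):

    mova              m1, [r1]          ; row coeffs 0-7
    mova              m2, [r1+16]       ; row coeffs 8-15
    pxor              m3, m3
    pxor              m4, m4
    psubw             m3, m1            ; negated coeffs
    psubw             m4, m2
    packuswb          m1, m2            ; positive parts, saturated to bytes
    packuswb          m3, m4            ; negative parts, saturated to bytes
    mova              m0, [r0]
    paddusb           m0, m1
    psubusb           m0, m3
    mova            [r0], m0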

> +    TR_ADD_INIT_SSE_16 r3, r2
> +    TR_ADD_OP_SSE    mova, r0, r2, r3
> +%rep 3
> +    lea                r1, [r1+16*8]
> +    lea                r0, [r0+r2*4]
> +    TR_ADD_INIT_SSE_16 r3, r2
> +    TR_ADD_OP_SSE    mova, r0, r2, r3
> +%endrep
> +    RET
> +
> +; void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
> +cglobal hevc_transform_add32_8, 3, 4, 6
> +    TR_ADD_INIT_SSE_16 r3, r2
> +    TR_ADD_OP_SSE_32 mova, r0, r2, r3
> +%rep 15
> +    lea                r1, [r1+16*8]
> +    lea                r0, [r0+r2*2]
> +    TR_ADD_INIT_SSE_16 r3, r2
> +    TR_ADD_OP_SSE_32 mova, r0, r2, r3
> +%endrep
> +    RET
> +
> +%if HAVE_AVX2_EXTERNAL
> +INIT_YMM avx2
> +
> +%endif ;HAVE_AVX2_EXTERNAL

This is not needed if you're not adding 8bit AVX2 functions for the time being.

> +%endif ;ARCH_X86_64
> +;-----------------------------------------------------------------------------
> +; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride)
> +;-----------------------------------------------------------------------------
> +%macro TR_ADD_OP_10 4
> +    movu              m6, [%4]
> +    movu              m7, [%4+16]
> +    movu              m8, [%4+32]
> +    movu              m9, [%4+48]
> +%if avx_enabled
> +    paddw             m0, m6, [%1+0   ]
> +    paddw             m1, m7, [%1+%2  ]
> +    paddw             m2, m8, [%1+%2*2]
> +    paddw             m3, m9, [%1+%3  ]
> +%else
> +    movu              m0, [%1+0   ]
> +    movu              m1, [%1+%2  ]
> +    movu              m2, [%1+%2*2]
> +    movu              m3, [%1+%3  ]
> +    paddw             m0, m6
> +    paddw             m1, m7
> +    paddw             m2, m8
> +    paddw             m3, m9

This makes no sense. You're using four extra registers and four more movs 
than necessary.
It would make sense if those movs needed to be unaligned, but they obviously 
don't judging by the avx version.

    mova              m0, [%4]
    mova              m1, [%4+16]
    mova              m2, [%4+32]
    mova              m3, [%4+48]
    paddw             m0, [%1+0   ]
    paddw             m1, [%1+%2  ]
    paddw             m2, [%1+%2*2]
    paddw             m3, [%1+%3  ]

You can remove the xmm AVX versions after this, since no three-operand instructions
remain. Also, this would make the functions work on x86_32.
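The instantiation at the bottom would then reduce to something like this (untested;
assuming the register usage is also brought down so everything fits in the eight xmm
registers available on x86_32), with the AVX2 block kept below it:

INIT_XMM sse2
TR_ADD8
TRANS_ADD16
TRANS_ADD32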

> +%endif
> +    CLIPW             m0, m4, m5
> +    CLIPW             m1, m4, m5
> +    CLIPW             m2, m4, m5
> +    CLIPW             m3, m4, m5
> +    movu       [%1+0   ], m0
> +    movu       [%1+%2  ], m1
> +    movu       [%1+%2*2], m2
> +    movu       [%1+%3  ], m3
> +%endmacro
> +
> +%macro TR_ADD_MMX_10 3
> +    mova              m4, [%3]
> +    mova              m5, [%3+8]
> +    mova              m0, [%1+0   ]
> +    mova              m1, [%1+%2  ]
> +    paddw             m0, m4
> +    paddw             m1, m5
> +    CLIPW             m0, m2, m3
> +    CLIPW             m1, m2, m3
> +    mova       [%1+0   ], m0
> +    mova       [%1+%2  ], m1
> +%endmacro
> +
> +%macro TRANS_ADD16_10 3
> +    mova              m6, [%3]
> +    mova              m7, [%3+16]
> +    mova              m8, [%3+32]
> +    mova              m9, [%3+48]
> +
> +%if avx_enabled
> +    paddw             m0, m6, [%1      ]
> +    paddw             m1, m7, [%1+16   ]
> +    paddw             m2, m8, [%1+%2   ]
> +    paddw             m3, m9, [%1+%2+16]
> +%else
> +    mova              m0, [%1      ]
> +    mova              m1, [%1+16   ]
> +    mova              m2, [%1+%2   ]
> +    mova              m3, [%1+%2+16]
> +    paddw             m0, m6
> +    paddw             m1, m7
> +    paddw             m2, m8
> +    paddw             m3, m9

Same as above, here and below.

> +%endif
> +    CLIPW             m0, m4, m5
> +    CLIPW             m1, m4, m5
> +    CLIPW             m2, m4, m5
> +    CLIPW             m3, m4, m5
> +    mova      [%1      ], m0
> +    mova      [%1+16   ], m1
> +    mova      [%1+%2   ], m2
> +    mova      [%1+%2+16], m3
> +%endmacro
> +
> +%macro TRANS_ADD32_10 2
> +    mova              m6, [%2]
> +    mova              m7, [%2+16]
> +    mova              m8, [%2+32]
> +    mova              m9, [%2+48]
> +
> +%if avx_enabled
> +    paddw             m0, m6, [%1   ]
> +    paddw             m1, m7, [%1+16]
> +    paddw             m2, m8, [%1+32]
> +    paddw             m3, m9, [%1+48]
> +%else
> +    mova              m0, [%1   ]
> +    mova              m1, [%1+16]
> +    mova              m2, [%1+32]
> +    mova              m3, [%1+48]
> +    paddw             m0, m6
> +    paddw             m1, m7
> +    paddw             m2, m8
> +    paddw             m3, m9
> +%endif
> +    CLIPW             m0, m4, m5
> +    CLIPW             m1, m4, m5
> +    CLIPW             m2, m4, m5
> +    CLIPW             m3, m4, m5
> +    mova         [%1   ], m0
> +    mova         [%1+16], m1
> +    mova         [%1+32], m2
> +    mova         [%1+48], m3
> +%endmacro
> +
> +
> +
> +%macro TRANS_ADD16_AVX2 4

The macro names are a bit confusing. Some state the bitdepth, others don't, etc.
It's mostly cosmetic and not really important, but it would be nice if the names 
were more consistent.

> +    movu              m6, [%4]
> +    movu              m7, [%4+32]
> +    movu              m8, [%4+64]
> +    movu              m9, [%4+96]
> +
> +    paddw             m0, m6, [%1+0   ]
> +    paddw             m1, m7, [%1+%2  ]
> +    paddw             m2, m8, [%1+%2*2]
> +    paddw             m3, m9, [%1+%3  ]

Same as the SSE2 versions.
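i.e. something like this for the AVX2 case (untested; keeping movu for the coefficient
loads in case that buffer is only 16-byte aligned):

    movu              m0, [%4]
    movu              m1, [%4+32]
    movu              m2, [%4+64]
    movu              m3, [%4+96]
    paddw             m0, [%1+0   ]
    paddw             m1, [%1+%2  ]
    paddw             m2, [%1+%2*2]
    paddw             m3, [%1+%3  ]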

> +
> +    CLIPW             m0, m4, m5
> +    CLIPW             m1, m4, m5
> +    CLIPW             m2, m4, m5
> +    CLIPW             m3, m4, m5
> +    movu       [%1+0   ], m0
> +    movu       [%1+%2  ], m1
> +    movu       [%1+%2*2], m2
> +    movu       [%1+%3  ], m3
> +%endmacro
> +
> +%macro TRANS_ADD32_AVX2 3
> +    movu              m6, [%3]
> +    movu              m7, [%3+32]
> +    movu              m8, [%3+64]
> +    movu              m9, [%3+96]
> +
> +    paddw             m0, m6, [%1      ]
> +    paddw             m1, m7, [%1+32   ]
> +    paddw             m2, m8, [%1+%2   ]
> +    paddw             m3, m9, [%1+%2+32]
> +
> +    CLIPW             m0, m4, m5
> +    CLIPW             m1, m4, m5
> +    CLIPW             m2, m4, m5
> +    CLIPW             m3, m4, m5
> +    movu      [%1      ], m0
> +    movu      [%1+32   ], m1
> +    movu      [%1+%2   ], m2
> +    movu      [%1+%2+32], m3
> +%endmacro
> +
> +
> +INIT_MMX mmxext
> +cglobal hevc_transform_add4_10,3,4, 6
> +    pxor              m2, m2
> +    mova              m3, [max_pixels_10]
> +    TR_ADD_MMX_10     r0, r2, r1
> +    lea               r1, [r1+16]
> +    lea               r0, [r0+2*r2]
> +    TR_ADD_MMX_10     r0, r2, r1
> +    RET
> +
> +;-----------------------------------------------------------------------------
> +; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride)
> +;-----------------------------------------------------------------------------
> +%macro TR_ADD8 0
> +cglobal hevc_transform_add8_10,3,4,7
> +    pxor              m4, m4
> +    mova              m5, [max_pixels_10]
> +    lea               r3, [r2*3]
> +
> +    TR_ADD_OP_10      r0, r2, r3, r1
> +    lea               r0, [r0+r2*4]
> +    lea               r1, [r1+64]
> +    TR_ADD_OP_10      r0, r2, r3, r1
> +    RET
> +%endmacro
> +
> +%macro TRANS_ADD16 0
> +cglobal hevc_transform_add16_10,3,4,7
> +    pxor              m4, m4
> +    mova              m5, [max_pixels_10]
> +
> +    TRANS_ADD16_10    r0, r2, r1
> +%rep 7
> +    lea               r0, [r0+r2*2]
> +    lea               r1, [r1+64]
> +    TRANS_ADD16_10    r0, r2, r1
> +%endrep
> +    RET
> +%endmacro
> +
> +
> +%macro TRANS_ADD32 0
> +cglobal hevc_transform_add32_10,3,4,7
> +    pxor              m4, m4
> +    mova              m5, [max_pixels_10]
> +
> +    TRANS_ADD32_10    r0, r1
> +%rep 31
> +    lea               r0, [r0+r2]
> +    lea               r1, [r1+64]
> +    TRANS_ADD32_10    r0, r1
> +%endrep
> +    RET
> +%endmacro
> +%if ARCH_X86_64
> +INIT_XMM sse2
> +TR_ADD8
> +TRANS_ADD16
> +TRANS_ADD32
> +%if HAVE_AVX_EXTERNAL
> +INIT_XMM avx
> +TR_ADD8
> +TRANS_ADD16
> +TRANS_ADD32
> +%endif
> +
> +%if HAVE_AVX2_EXTERNAL
> +INIT_YMM avx2
> +
> +cglobal hevc_transform_add16_10,3,4,10
> +    pxor              m4, m4
> +    mova              m5, [max_pixels_10]
> +    lea               r3, [r2*3]
> +
> +    TRANS_ADD16_AVX2  r0, r2, r3, r1
> +%rep 3
> +    lea               r0, [r0+r2*4]
> +    lea               r1, [r1+128]
> +    TRANS_ADD16_AVX2  r0, r2, r3, r1
> +%endrep
> +    RET
> +
> +cglobal hevc_transform_add32_10,3,4,10
> +    pxor              m4, m4
> +    mova              m5, [max_pixels_10]
> +
> +    TRANS_ADD32_AVX2  r0, r2, r1
> +%rep 15
> +    lea               r0, [r0+r2*2]
> +    lea               r1, [r1+128]
> +    TRANS_ADD32_AVX2  r0, r2, r1
> +%endrep
> +    RET
> +%endif ;HAVE_AVX_EXTERNAL
> +%endif ;ARCH_X86_64
> +
> diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
> index 4bcc8dc..5c3a51c 100644
> --- a/libavcodec/x86/hevcdsp.h
> +++ b/libavcodec/x86/hevcdsp.h

Not part of the review, but is this header necessary at all? Only one file 
includes it, so all these prototypes could very well be there instead.

> @@ -131,4 +131,25 @@ WEIGHTING_PROTOTYPES(8, sse4);
>  WEIGHTING_PROTOTYPES(10, sse4);
>  WEIGHTING_PROTOTYPES(12, sse4);
>  
> +///////////////////////////////////////////////////////////////////////////////
> +// TRANSFORM_ADD
> +///////////////////////////////////////////////////////////////////////////////
> +void ff_hevc_transform_add4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add32_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +
> +void ff_hevc_transform_add4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add32_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +
> +
> +void ff_hevc_transform_add8_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add16_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add32_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +
> +void ff_hevc_transform_add16_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_transform_add32_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
> +
>  #endif // AVCODEC_X86_HEVCDSP_H
> diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
> index 828c081..0f3ad7c 100644
> --- a/libavcodec/x86/hevcdsp_init.c
> +++ b/libavcodec/x86/hevcdsp_init.c
> @@ -469,6 +469,7 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
>          if (EXTERNAL_MMXEXT(cpu_flags)) {
>              c->idct_dc[0] = ff_hevc_idct4x4_dc_8_mmxext;
>              c->idct_dc[1] = ff_hevc_idct8x8_dc_8_mmxext;
> +            c->transform_add[0]    =  ff_hevc_transform_add4_8_mmxext;
>          }
>          if (EXTERNAL_SSE2(cpu_flags)) {
>              c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
> @@ -476,8 +477,10 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
>              if (ARCH_X86_64) {
>                  c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_sse2;
>                  c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_sse2;
> +                c->transform_add[1]    = ff_hevc_transform_add8_8_sse2;
> +                c->transform_add[2]    = ff_hevc_transform_add16_8_sse2;
> +                c->transform_add[3]    = ff_hevc_transform_add32_8_sse2;
>              }
> -
>              c->idct_dc[1] = ff_hevc_idct8x8_dc_8_sse2;
>              c->idct_dc[2] = ff_hevc_idct16x16_dc_8_sse2;
>              c->idct_dc[3] = ff_hevc_idct32x32_dc_8_sse2;
> @@ -512,6 +515,7 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
>          }
>      } else if (bit_depth == 10) {
>          if (EXTERNAL_MMXEXT(cpu_flags)) {
> +            c->transform_add[0] = ff_hevc_transform_add4_10_mmxext;
>              c->idct_dc[0] = ff_hevc_idct4x4_dc_10_mmxext;
>              c->idct_dc[1] = ff_hevc_idct8x8_dc_10_mmxext;
>          }
> @@ -521,6 +525,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
>              if (ARCH_X86_64) {
>                  c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_sse2;
>                  c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_sse2;
> +                c->transform_add[1]    = ff_hevc_transform_add8_8_sse2;
> +                c->transform_add[2]    = ff_hevc_transform_add16_8_sse2;
> +                c->transform_add[3]    = ff_hevc_transform_add32_8_sse2;

You're using the 8-bit functions instead of the 10-bit ones here.

>              }
>  
>              c->idct_dc[1] = ff_hevc_idct8x8_dc_10_sse2;
> @@ -548,11 +555,19 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
>              if (ARCH_X86_64) {
>                  c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_avx;
>                  c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_avx;
> +                c->transform_add[1] =  ff_hevc_transform_add8_10_avx;
> +                c->transform_add[2] = ff_hevc_transform_add16_10_avx;
> +                c->transform_add[3] = ff_hevc_transform_add32_10_avx;
>              }
>          }
>          if (EXTERNAL_AVX2(cpu_flags)) {
> +
>              c->idct_dc[2] = ff_hevc_idct16x16_dc_10_avx2;
>              c->idct_dc[3] = ff_hevc_idct32x32_dc_10_avx2;
> +            if (ARCH_X86_64) {
> +                c->transform_add[2] = ff_hevc_transform_add16_10_avx2;
> +                c->transform_add[3] = ff_hevc_transform_add32_10_avx2;
> +            }
>  
>          }
>      } else if (bit_depth == 12) {

I suppose adding 12-bit versions should be pretty easy, but no hurry since I doubt there are
any such files out there outside the conformance suite :P
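The 10-bit macros only depend on the clip maximum, so a 12-bit flavour would mostly be a
new constant plus bit-depth-parametrized instantiations, roughly (untested sketch, reusing
the patch's TR_ADD_OP_10 body as is):

max_pixels_12:          times 16  dw ((1 << 12)-1)

%macro TR_ADD8 1 ; %1: bit depth
cglobal hevc_transform_add8_%1, 3, 4, 7
    pxor              m4, m4
    mova              m5, [max_pixels_%1]
    lea               r3, [r2*3]

    TR_ADD_OP_10      r0, r2, r3, r1
    lea               r0, [r0+r2*4]
    add               r1, 64
    TR_ADD_OP_10      r0, r2, r3, r1
    RET
%endmacro

INIT_XMM sse2
TR_ADD8 10
TR_ADD8 12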


