[FFmpeg-cvslog] Merge commit 'fca3c3b61952aacc45e9ca54d86a762946c21942'

Clément Bœsch git at videolan.org
Tue Jan 31 17:54:34 EET 2017


ffmpeg | branch: master | Clément Bœsch <cboesch at gopro.com> | Tue Jan 31 16:50:21 2017 +0100| [78d16eb45217f7ce811d1b05afe56427dd40021b] | committer: Clément Bœsch

Merge commit 'fca3c3b61952aacc45e9ca54d86a762946c21942'

* commit 'fca3c3b61952aacc45e9ca54d86a762946c21942':
  hevc: Add AVX2 DC IDCT

Mostly a no-op, as we already have that code.

In the ASM, code is merged with the exception of SECTION which is kept
uppercase for consistency with the rest of the codebase.

Still in the ASM, the prototype comment is fixed to honor the '_' added
by the original commit.

idct_dc_proto() is dropped as it's not used anymore here.

Merged-by: Clément Bœsch <cboesch at gopro.com>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=78d16eb45217f7ce811d1b05afe56427dd40021b
---

 libavcodec/x86/hevc_idct.asm  | 69 ++++++++++++++++++++++---------------------
 libavcodec/x86/hevcdsp.h      |  3 --
 libavcodec/x86/hevcdsp_init.c | 48 +++++++++++++++---------------
 3 files changed, 59 insertions(+), 61 deletions(-)

diff --git a/libavcodec/x86/hevc_idct.asm b/libavcodec/x86/hevc_idct.asm
index 2edaf9a..33b437c 100644
--- a/libavcodec/x86/hevc_idct.asm
+++ b/libavcodec/x86/hevc_idct.asm
@@ -1,37 +1,38 @@
-; /*
-; * SIMD optimized idct functions for HEVC decoding
-; * Copyright (c) 2014 Pierre-Edouard LEPERE
-; * Copyright (c) 2014 James Almer
-; *
-; * This file is part of FFmpeg.
-; *
-; * FFmpeg is free software; you can redistribute it and/or
-; * modify it under the terms of the GNU Lesser General Public
-; * License as published by the Free Software Foundation; either
-; * version 2.1 of the License, or (at your option) any later version.
-; *
-; * FFmpeg is distributed in the hope that it will be useful,
-; * but WITHOUT ANY WARRANTY; without even the implied warranty of
-; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-; * Lesser General Public License for more details.
-; *
-; * You should have received a copy of the GNU Lesser General Public
-; * License along with FFmpeg; if not, write to the Free Software
-; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-; */
+;*******************************************************************************
+;* SIMD-optimized IDCT functions for HEVC decoding
+;* Copyright (c) 2014 Pierre-Edouard LEPERE
+;* Copyright (c) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
 %include "libavutil/x86/x86util.asm"
 
 SECTION .text
 
-; void ff_hevc_idctHxW_dc_{8,10}_<opt>(int16_t *coeffs)
+; void ff_hevc_idct_HxW_dc_{8,10}_<opt>(int16_t *coeffs)
 ; %1 = HxW
 ; %2 = number of loops
 ; %3 = bitdepth
 %macro IDCT_DC 3
-cglobal hevc_idct%1x%1_dc_%3, 1, 2, 1, coeff, tmp
-    movsx             tmpq, word [coeffq]
-    add               tmpw, ((1 << 14-%3) + 1)
-    sar               tmpw, (15-%3)
+cglobal hevc_idct_%1x%1_dc_%3, 1, 2, 1, coeff, tmp
+    movsx             tmpd, word [coeffq]
+    add               tmpd, (1 << (14 - %3)) + 1
+    sar               tmpd, (15 - %3)
     movd               xm0, tmpd
     SPLATW              m0, xm0
     DEFINE_ARGS coeff, cnt
@@ -41,11 +42,11 @@ cglobal hevc_idct%1x%1_dc_%3, 1, 2, 1, coeff, tmp
     mova [coeffq+mmsize*1], m0
     mova [coeffq+mmsize*2], m0
     mova [coeffq+mmsize*3], m0
-    mova [coeffq+mmsize*4], m0
-    mova [coeffq+mmsize*5], m0
-    mova [coeffq+mmsize*6], m0
-    mova [coeffq+mmsize*7], m0
     add  coeffq, mmsize*8
+    mova [coeffq+mmsize*-4], m0
+    mova [coeffq+mmsize*-3], m0
+    mova [coeffq+mmsize*-2], m0
+    mova [coeffq+mmsize*-1], m0
     dec  cntd
     jg  .loop
     RET
@@ -54,10 +55,10 @@ cglobal hevc_idct%1x%1_dc_%3, 1, 2, 1, coeff, tmp
 ; %1 = HxW
 ; %2 = bitdepth
 %macro IDCT_DC_NL 2 ; No loop
-cglobal hevc_idct%1x%1_dc_%2, 1, 2, 1, coeff, tmp
-    movsx             tmpq, word [coeffq]
-    add               tmpw, ((1 << 14-%2) + 1)
-    sar               tmpw, (15-%2)
+cglobal hevc_idct_%1x%1_dc_%2, 1, 2, 1, coeff, tmp
+    movsx             tmpd, word [coeffq]
+    add               tmpd, (1 << (14 - %2)) + 1
+    sar               tmpd, (15 - %2)
     movd                m0, tmpd
     SPLATW              m0, xm0
     mova [coeffq+mmsize*0], m0
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
index 3cfdc27..63a148e 100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -29,9 +29,6 @@
 #include <stdint.h>
 
 
-#define idct_dc_proto(size, bitd, opt) \
-                void ff_hevc_idct##size##_dc_add_##bitd##_##opt(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-
 #define PEL_LINK(dst, idx1, idx2, idx3, name, D, opt) \
 dst[idx1][idx2][idx3] = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \
 dst ## _bi[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index da73d76..d16e59d 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -59,9 +59,9 @@ LFL_FUNCS(uint8_t,  10, avx)
 LFL_FUNCS(uint8_t,  12, avx)
 
 #define IDCT_FUNCS(W, opt) \
-void ff_hevc_idct##W##_dc_8_##opt(int16_t *coeffs); \
-void ff_hevc_idct##W##_dc_10_##opt(int16_t *coeffs); \
-void ff_hevc_idct##W##_dc_12_##opt(int16_t *coeffs)
+void ff_hevc_idct_ ## W ## _dc_8_ ## opt(int16_t *coeffs); \
+void ff_hevc_idct_ ## W ## _dc_10_ ## opt(int16_t *coeffs); \
+void ff_hevc_idct_ ## W ## _dc_12_ ## opt(int16_t *coeffs)
 
 IDCT_FUNCS(4x4,   mmxext);
 IDCT_FUNCS(8x8,   mmxext);
@@ -698,8 +698,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
 
     if (bit_depth == 8) {
         if (EXTERNAL_MMXEXT(cpu_flags)) {
-            c->idct_dc[0] = ff_hevc_idct4x4_dc_8_mmxext;
-            c->idct_dc[1] = ff_hevc_idct8x8_dc_8_mmxext;
+            c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_mmxext;
+            c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_mmxext;
             c->add_residual[0] = ff_hevc_add_residual4_8_mmxext;
         }
         if (EXTERNAL_SSE2(cpu_flags)) {
@@ -712,9 +712,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             }
             SAO_BAND_INIT(8, sse2);
 
-            c->idct_dc[1] = ff_hevc_idct8x8_dc_8_sse2;
-            c->idct_dc[2] = ff_hevc_idct16x16_dc_8_sse2;
-            c->idct_dc[3] = ff_hevc_idct32x32_dc_8_sse2;
+            c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_sse2;
+            c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_sse2;
+            c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_sse2;
 
             c->add_residual[1] = ff_hevc_add_residual8_8_sse2;
             c->add_residual[2] = ff_hevc_add_residual16_8_sse2;
@@ -757,8 +757,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_8_avx2;
         }
         if (EXTERNAL_AVX2_FAST(cpu_flags)) {
-            c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2;
-            c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2;
+            c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2;
+            c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2;
             if (ARCH_X86_64) {
                 c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
                 c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
@@ -855,8 +855,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
     } else if (bit_depth == 10) {
         if (EXTERNAL_MMXEXT(cpu_flags)) {
             c->add_residual[0] = ff_hevc_add_residual4_10_mmxext;
-            c->idct_dc[0] = ff_hevc_idct4x4_dc_10_mmxext;
-            c->idct_dc[1] = ff_hevc_idct8x8_dc_10_mmxext;
+            c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_mmxext;
+            c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_mmxext;
         }
         if (EXTERNAL_SSE2(cpu_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
@@ -868,9 +868,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             SAO_BAND_INIT(10, sse2);
             SAO_EDGE_INIT(10, sse2);
 
-            c->idct_dc[1] = ff_hevc_idct8x8_dc_10_sse2;
-            c->idct_dc[2] = ff_hevc_idct16x16_dc_10_sse2;
-            c->idct_dc[3] = ff_hevc_idct32x32_dc_10_sse2;
+            c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_sse2;
+            c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_sse2;
+            c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_sse2;
 
             c->add_residual[1] = ff_hevc_add_residual8_10_sse2;
             c->add_residual[2] = ff_hevc_add_residual16_10_sse2;
@@ -904,8 +904,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_10_avx2;
         }
         if (EXTERNAL_AVX2_FAST(cpu_flags)) {
-            c->idct_dc[2] = ff_hevc_idct16x16_dc_10_avx2;
-            c->idct_dc[3] = ff_hevc_idct32x32_dc_10_avx2;
+            c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2;
+            c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2;
             if (ARCH_X86_64) {
                 c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
                 c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
@@ -1059,8 +1059,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
         }
     } else if (bit_depth == 12) {
         if (EXTERNAL_MMXEXT(cpu_flags)) {
-            c->idct_dc[0] = ff_hevc_idct4x4_dc_12_mmxext;
-            c->idct_dc[1] = ff_hevc_idct8x8_dc_12_mmxext;
+            c->idct_dc[0] = ff_hevc_idct_4x4_dc_12_mmxext;
+            c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_mmxext;
         }
         if (EXTERNAL_SSE2(cpu_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2;
@@ -1072,9 +1072,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             SAO_BAND_INIT(12, sse2);
             SAO_EDGE_INIT(12, sse2);
 
-            c->idct_dc[1] = ff_hevc_idct8x8_dc_12_sse2;
-            c->idct_dc[2] = ff_hevc_idct16x16_dc_12_sse2;
-            c->idct_dc[3] = ff_hevc_idct32x32_dc_12_sse2;
+            c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_sse2;
+            c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_sse2;
+            c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_sse2;
         }
         if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
             c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
@@ -1104,8 +1104,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_12_avx2;
         }
         if (EXTERNAL_AVX2_FAST(cpu_flags)) {
-            c->idct_dc[2] = ff_hevc_idct16x16_dc_12_avx2;
-            c->idct_dc[3] = ff_hevc_idct32x32_dc_12_avx2;
+            c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_avx2;
+            c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_avx2;
 
             SAO_BAND_INIT(12, avx2);
             SAO_EDGE_INIT(12, avx2);


======================================================================

diff --cc libavcodec/x86/hevc_idct.asm
index 2edaf9a,d662aa9..33b437c
--- a/libavcodec/x86/hevc_idct.asm
+++ b/libavcodec/x86/hevc_idct.asm
@@@ -1,29 -1,30 +1,30 @@@
- ; /*
- ; * SIMD optimized idct functions for HEVC decoding
- ; * Copyright (c) 2014 Pierre-Edouard LEPERE
- ; * Copyright (c) 2014 James Almer
- ; *
- ; * This file is part of FFmpeg.
- ; *
- ; * FFmpeg is free software; you can redistribute it and/or
- ; * modify it under the terms of the GNU Lesser General Public
- ; * License as published by the Free Software Foundation; either
- ; * version 2.1 of the License, or (at your option) any later version.
- ; *
- ; * FFmpeg is distributed in the hope that it will be useful,
- ; * but WITHOUT ANY WARRANTY; without even the implied warranty of
- ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- ; * Lesser General Public License for more details.
- ; *
- ; * You should have received a copy of the GNU Lesser General Public
- ; * License along with FFmpeg; if not, write to the Free Software
- ; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- ; */
+ ;*******************************************************************************
+ ;* SIMD-optimized IDCT functions for HEVC decoding
+ ;* Copyright (c) 2014 Pierre-Edouard LEPERE
+ ;* Copyright (c) 2014 James Almer
+ ;*
 -;* This file is part of Libav.
++;* This file is part of FFmpeg.
+ ;*
 -;* Libav is free software; you can redistribute it and/or
++;* FFmpeg is free software; you can redistribute it and/or
+ ;* modify it under the terms of the GNU Lesser General Public
+ ;* License as published by the Free Software Foundation; either
+ ;* version 2.1 of the License, or (at your option) any later version.
+ ;*
 -;* Libav is distributed in the hope that it will be useful,
++;* FFmpeg is distributed in the hope that it will be useful,
+ ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ ;* Lesser General Public License for more details.
+ ;*
+ ;* You should have received a copy of the GNU Lesser General Public
 -;* License along with Libav; if not, write to the Free Software
++;* License along with FFmpeg; if not, write to the Free Software
+ ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ;******************************************************************************
+ 
  %include "libavutil/x86/x86util.asm"
  
 -section .text
 +SECTION .text
  
--; void ff_hevc_idctHxW_dc_{8,10}_<opt>(int16_t *coeffs)
++; void ff_hevc_idct_HxW_dc_{8,10}_<opt>(int16_t *coeffs)
  ; %1 = HxW
  ; %2 = number of loops
  ; %3 = bitdepth
diff --cc libavcodec/x86/hevcdsp.h
index 3cfdc27,0000000..63a148e
mode 100644,000000..100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@@ -1,261 -1,0 +1,258 @@@
 +/*
 + * HEVC video decoder
 + *
 + * Copyright (C) 2012 - 2013 Guillaume Martres
 + * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
 + *
 + *
 + * This file is part of FFmpeg.
 + *
 + * FFmpeg is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public
 + * License as published by the Free Software Foundation; either
 + * version 2.1 of the License, or (at your option) any later version.
 + *
 + * FFmpeg is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with FFmpeg; if not, write to the Free Software
 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 + */
 +
 +#ifndef AVCODEC_X86_HEVCDSP_H
 +#define AVCODEC_X86_HEVCDSP_H
 +
 +#include <stddef.h>
 +#include <stdint.h>
 +
 +
- #define idct_dc_proto(size, bitd, opt) \
-                 void ff_hevc_idct##size##_dc_add_##bitd##_##opt(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
- 
 +#define PEL_LINK(dst, idx1, idx2, idx3, name, D, opt) \
 +dst[idx1][idx2][idx3] = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \
 +dst ## _bi[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \
 +dst ## _uni[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt; \
 +dst ## _uni_w[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt; \
 +dst ## _bi_w[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt
 +
 +
 +#define PEL_PROTOTYPE(name, D, opt) \
 +void ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); \
 +void ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width); \
 +void ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width); \
 +void ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width); \
 +void ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, int denom, int wx0, int wx1, int ox0, int ox1, intptr_t mx, intptr_t my, int width)
 +
 +
 +///////////////////////////////////////////////////////////////////////////////
 +// MC functions
 +///////////////////////////////////////////////////////////////////////////////
 +
 +#define EPEL_PROTOTYPES(fname, bitd, opt) \
 +        PEL_PROTOTYPE(fname##4,  bitd, opt); \
 +        PEL_PROTOTYPE(fname##6,  bitd, opt); \
 +        PEL_PROTOTYPE(fname##8,  bitd, opt); \
 +        PEL_PROTOTYPE(fname##12, bitd, opt); \
 +        PEL_PROTOTYPE(fname##16, bitd, opt); \
 +        PEL_PROTOTYPE(fname##24, bitd, opt); \
 +        PEL_PROTOTYPE(fname##32, bitd, opt); \
 +        PEL_PROTOTYPE(fname##48, bitd, opt); \
 +        PEL_PROTOTYPE(fname##64, bitd, opt)
 +
 +#define QPEL_PROTOTYPES(fname, bitd, opt) \
 +        PEL_PROTOTYPE(fname##4,  bitd, opt); \
 +        PEL_PROTOTYPE(fname##8,  bitd, opt); \
 +        PEL_PROTOTYPE(fname##12, bitd, opt); \
 +        PEL_PROTOTYPE(fname##16, bitd, opt); \
 +        PEL_PROTOTYPE(fname##24, bitd, opt); \
 +        PEL_PROTOTYPE(fname##32, bitd, opt); \
 +        PEL_PROTOTYPE(fname##48, bitd, opt); \
 +        PEL_PROTOTYPE(fname##64, bitd, opt)
 +
 +#define WEIGHTING_PROTOTYPE(width, bitd, opt) \
 +void ff_hevc_put_hevc_uni_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int height, int denom,  int _wx, int _ox); \
 +void ff_hevc_put_hevc_bi_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int16_t *_src2, int height, int denom,  int _wx0,  int _wx1, int _ox0, int _ox1)
 +
 +#define WEIGHTING_PROTOTYPES(bitd, opt) \
 +        WEIGHTING_PROTOTYPE(2, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(4, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(6, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(8, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(12, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(16, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(24, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(32, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(48, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(64, bitd, opt)
 +
 +
 +///////////////////////////////////////////////////////////////////////////////
 +// QPEL_PIXELS EPEL_PIXELS
 +///////////////////////////////////////////////////////////////////////////////
 +EPEL_PROTOTYPES(pel_pixels ,  8, sse4);
 +EPEL_PROTOTYPES(pel_pixels , 10, sse4);
 +EPEL_PROTOTYPES(pel_pixels , 12, sse4);
 +
 +void ff_hevc_put_hevc_pel_pixels16_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_pel_pixels24_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_pel_pixels32_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_pel_pixels48_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_pel_pixels64_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +
 +void ff_hevc_put_hevc_pel_pixels16_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_pel_pixels24_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_pel_pixels32_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_pel_pixels48_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_pel_pixels64_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +
 +
 +
 +void ff_hevc_put_hevc_uni_pel_pixels32_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_uni_pel_pixels48_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_uni_pel_pixels64_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_uni_pel_pixels96_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); //used for 10bit
 +void ff_hevc_put_hevc_uni_pel_pixels128_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);//used for 10bit
 +
 +
 +void ff_hevc_put_hevc_bi_pel_pixels16_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels24_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels32_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels48_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels64_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +
 +void ff_hevc_put_hevc_bi_pel_pixels16_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels24_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels32_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels48_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels64_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +
 +///////////////////////////////////////////////////////////////////////////////
 +// EPEL
 +///////////////////////////////////////////////////////////////////////////////
 +EPEL_PROTOTYPES(epel_h ,  8, sse4);
 +EPEL_PROTOTYPES(epel_h , 10, sse4);
 +EPEL_PROTOTYPES(epel_h , 12, sse4);
 +
 +EPEL_PROTOTYPES(epel_v ,  8, sse4);
 +EPEL_PROTOTYPES(epel_v , 10, sse4);
 +EPEL_PROTOTYPES(epel_v , 12, sse4);
 +
 +EPEL_PROTOTYPES(epel_hv ,  8, sse4);
 +EPEL_PROTOTYPES(epel_hv , 10, sse4);
 +EPEL_PROTOTYPES(epel_hv , 12, sse4);
 +
 +PEL_PROTOTYPE(epel_h16, 8, avx2);
 +PEL_PROTOTYPE(epel_h24, 8, avx2);
 +PEL_PROTOTYPE(epel_h32, 8, avx2);
 +PEL_PROTOTYPE(epel_h48, 8, avx2);
 +PEL_PROTOTYPE(epel_h64, 8, avx2);
 +
 +PEL_PROTOTYPE(epel_h16,10, avx2);
 +PEL_PROTOTYPE(epel_h24,10, avx2);
 +PEL_PROTOTYPE(epel_h32,10, avx2);
 +PEL_PROTOTYPE(epel_h48,10, avx2);
 +PEL_PROTOTYPE(epel_h64,10, avx2);
 +
 +PEL_PROTOTYPE(epel_v16, 8, avx2);
 +PEL_PROTOTYPE(epel_v24, 8, avx2);
 +PEL_PROTOTYPE(epel_v32, 8, avx2);
 +PEL_PROTOTYPE(epel_v48, 8, avx2);
 +PEL_PROTOTYPE(epel_v64, 8, avx2);
 +
 +PEL_PROTOTYPE(epel_v16,10, avx2);
 +PEL_PROTOTYPE(epel_v24,10, avx2);
 +PEL_PROTOTYPE(epel_v32,10, avx2);
 +PEL_PROTOTYPE(epel_v48,10, avx2);
 +PEL_PROTOTYPE(epel_v64,10, avx2);
 +
 +PEL_PROTOTYPE(epel_hv16, 8, avx2);
 +PEL_PROTOTYPE(epel_hv24, 8, avx2);
 +PEL_PROTOTYPE(epel_hv32, 8, avx2);
 +PEL_PROTOTYPE(epel_hv48, 8, avx2);
 +PEL_PROTOTYPE(epel_hv64, 8, avx2);
 +
 +PEL_PROTOTYPE(epel_hv16,10, avx2);
 +PEL_PROTOTYPE(epel_hv24,10, avx2);
 +PEL_PROTOTYPE(epel_hv32,10, avx2);
 +PEL_PROTOTYPE(epel_hv48,10, avx2);
 +PEL_PROTOTYPE(epel_hv64,10, avx2);
 +
 +///////////////////////////////////////////////////////////////////////////////
 +// QPEL
 +///////////////////////////////////////////////////////////////////////////////
 +QPEL_PROTOTYPES(qpel_h ,  8, sse4);
 +QPEL_PROTOTYPES(qpel_h , 10, sse4);
 +QPEL_PROTOTYPES(qpel_h , 12, sse4);
 +
 +QPEL_PROTOTYPES(qpel_v,  8, sse4);
 +QPEL_PROTOTYPES(qpel_v, 10, sse4);
 +QPEL_PROTOTYPES(qpel_v, 12, sse4);
 +
 +QPEL_PROTOTYPES(qpel_hv,  8, sse4);
 +QPEL_PROTOTYPES(qpel_hv, 10, sse4);
 +QPEL_PROTOTYPES(qpel_hv, 12, sse4);
 +
 +PEL_PROTOTYPE(qpel_h16, 8, avx2);
 +PEL_PROTOTYPE(qpel_h24, 8, avx2);
 +PEL_PROTOTYPE(qpel_h32, 8, avx2);
 +PEL_PROTOTYPE(qpel_h48, 8, avx2);
 +PEL_PROTOTYPE(qpel_h64, 8, avx2);
 +
 +PEL_PROTOTYPE(qpel_h16,10, avx2);
 +PEL_PROTOTYPE(qpel_h24,10, avx2);
 +PEL_PROTOTYPE(qpel_h32,10, avx2);
 +PEL_PROTOTYPE(qpel_h48,10, avx2);
 +PEL_PROTOTYPE(qpel_h64,10, avx2);
 +
 +PEL_PROTOTYPE(qpel_v16, 8, avx2);
 +PEL_PROTOTYPE(qpel_v24, 8, avx2);
 +PEL_PROTOTYPE(qpel_v32, 8, avx2);
 +PEL_PROTOTYPE(qpel_v48, 8, avx2);
 +PEL_PROTOTYPE(qpel_v64, 8, avx2);
 +
 +PEL_PROTOTYPE(qpel_v16,10, avx2);
 +PEL_PROTOTYPE(qpel_v24,10, avx2);
 +PEL_PROTOTYPE(qpel_v32,10, avx2);
 +PEL_PROTOTYPE(qpel_v48,10, avx2);
 +PEL_PROTOTYPE(qpel_v64,10, avx2);
 +
 +PEL_PROTOTYPE(qpel_hv16, 8, avx2);
 +PEL_PROTOTYPE(qpel_hv24, 8, avx2);
 +PEL_PROTOTYPE(qpel_hv32, 8, avx2);
 +PEL_PROTOTYPE(qpel_hv48, 8, avx2);
 +PEL_PROTOTYPE(qpel_hv64, 8, avx2);
 +
 +PEL_PROTOTYPE(qpel_hv16,10, avx2);
 +PEL_PROTOTYPE(qpel_hv24,10, avx2);
 +PEL_PROTOTYPE(qpel_hv32,10, avx2);
 +PEL_PROTOTYPE(qpel_hv48,10, avx2);
 +PEL_PROTOTYPE(qpel_hv64,10, avx2);
 +
 +WEIGHTING_PROTOTYPES(8, sse4);
 +WEIGHTING_PROTOTYPES(10, sse4);
 +WEIGHTING_PROTOTYPES(12, sse4);
 +
 +///////////////////////////////////////////////////////////////////////////////
 +// TRANSFORM_ADD
 +///////////////////////////////////////////////////////////////////////////////
 +void ff_hevc_add_residual4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +void ff_hevc_add_residual8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +void ff_hevc_add_residual16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +void ff_hevc_add_residual32_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +
 +void ff_hevc_add_residual8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +void ff_hevc_add_residual16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +void ff_hevc_add_residual32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +
 +void ff_hevc_add_residual32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +
 +void ff_hevc_add_residual4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +void ff_hevc_add_residual8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +void ff_hevc_add_residual16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +void ff_hevc_add_residual32_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +
 +void ff_hevc_add_residual16_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +void ff_hevc_add_residual32_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +
 +#endif // AVCODEC_X86_HEVCDSP_H
diff --cc libavcodec/x86/hevcdsp_init.c
index da73d76,1a675ab..d16e59d
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@@ -34,34 -32,43 +34,34 @@@ void ff_hevc_ ## DIR ## _loop_filter_ch
  #define LFL_FUNC(DIR, DEPTH, OPT) \
  void ff_hevc_ ## DIR ## _loop_filter_luma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, int beta, int *tc, uint8_t *no_p, uint8_t *no_q);
  
 -#define LFC_FUNCS(type, depth) \
 -    LFC_FUNC(h, depth, sse2)   \
 -    LFC_FUNC(v, depth, sse2)
 -
 -#define LFL_FUNCS(type, depth) \
 -    LFL_FUNC(h, depth, ssse3)  \
 -    LFL_FUNC(v, depth, ssse3)
 -
 -LFC_FUNCS(uint8_t, 8)
 -LFC_FUNCS(uint8_t, 10)
 -LFL_FUNCS(uint8_t, 8)
 -LFL_FUNCS(uint8_t, 10)
 -
 -#define idct_dc_proto(size, bitd, opt) \
 -                void ff_hevc_idct_ ## size ## _dc_add_ ## bitd ## _ ## opt(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 +#define LFC_FUNCS(type, depth, opt) \
 +    LFC_FUNC(h, depth, opt)  \
 +    LFC_FUNC(v, depth, opt)
  
 -idct_dc_proto(4, 8,mmxext);
 -idct_dc_proto(8, 8,mmxext);
 -idct_dc_proto(16,8,  sse2);
 -idct_dc_proto(32,8,  sse2);
 +#define LFL_FUNCS(type, depth, opt) \
 +    LFL_FUNC(h, depth, opt)  \
 +    LFL_FUNC(v, depth, opt)
  
 -idct_dc_proto(32,8,  avx2);
 -
 -idct_dc_proto(4, 10,mmxext);
 -idct_dc_proto(8, 10,  sse2);
 -idct_dc_proto(16,10,  sse2);
 -idct_dc_proto(32,10,  sse2);
 -idct_dc_proto(8, 10,   avx);
 -idct_dc_proto(16,10,   avx);
 -idct_dc_proto(32,10,   avx);
 -
 -idct_dc_proto(16,10,  avx2);
 -idct_dc_proto(32,10,  avx2);
 +LFC_FUNCS(uint8_t,   8, sse2)
 +LFC_FUNCS(uint8_t,  10, sse2)
 +LFC_FUNCS(uint8_t,  12, sse2)
 +LFC_FUNCS(uint8_t,   8, avx)
 +LFC_FUNCS(uint8_t,  10, avx)
 +LFC_FUNCS(uint8_t,  12, avx)
 +LFL_FUNCS(uint8_t,   8, sse2)
 +LFL_FUNCS(uint8_t,  10, sse2)
 +LFL_FUNCS(uint8_t,  12, sse2)
 +LFL_FUNCS(uint8_t,   8, ssse3)
 +LFL_FUNCS(uint8_t,  10, ssse3)
 +LFL_FUNCS(uint8_t,  12, ssse3)
 +LFL_FUNCS(uint8_t,   8, avx)
 +LFL_FUNCS(uint8_t,  10, avx)
 +LFL_FUNCS(uint8_t,  12, avx)
  
  #define IDCT_FUNCS(W, opt) \
- void ff_hevc_idct##W##_dc_8_##opt(int16_t *coeffs); \
- void ff_hevc_idct##W##_dc_10_##opt(int16_t *coeffs); \
- void ff_hevc_idct##W##_dc_12_##opt(int16_t *coeffs)
+ void ff_hevc_idct_ ## W ## _dc_8_ ## opt(int16_t *coeffs); \
 -void ff_hevc_idct_ ## W ## _dc_10_ ## opt(int16_t *coeffs)
++void ff_hevc_idct_ ## W ## _dc_10_ ## opt(int16_t *coeffs); \
++void ff_hevc_idct_ ## W ## _dc_12_ ## opt(int16_t *coeffs)
  
  IDCT_FUNCS(4x4,   mmxext);
  IDCT_FUNCS(8x8,   mmxext);
@@@ -696,419 -240,126 +696,419 @@@ void ff_hevc_dsp_init_x86(HEVCDSPContex
  {
      int cpu_flags = av_get_cpu_flags();
  
 -#define SET_LUMA_FUNCS(tabname, funcname, depth, cf)      \
 -    c->tabname[0] = funcname ## _4_  ## depth ## _ ## cf; \
 -    c->tabname[1] = funcname ## _8_  ## depth ## _ ## cf; \
 -    c->tabname[2] = funcname ## _12_ ## depth ## _ ## cf; \
 -    c->tabname[3] = funcname ## _16_ ## depth ## _ ## cf; \
 -    c->tabname[4] = funcname ## _24_ ## depth ## _ ## cf; \
 -    c->tabname[5] = funcname ## _32_ ## depth ## _ ## cf; \
 -    c->tabname[6] = funcname ## _48_ ## depth ## _ ## cf; \
 -    c->tabname[7] = funcname ## _64_ ## depth ## _ ## cf;
 -
 -#define SET_CHROMA_FUNCS(tabname, funcname, depth, cf)    \
 -    c->tabname[1] = funcname ## _4_  ## depth ## _ ## cf; \
 -    c->tabname[3] = funcname ## _8_  ## depth ## _ ## cf; \
 -    c->tabname[4] = funcname ## _12_ ## depth ## _ ## cf; \
 -    c->tabname[5] = funcname ## _16_ ## depth ## _ ## cf; \
 -    c->tabname[6] = funcname ## _24_ ## depth ## _ ## cf; \
 -    c->tabname[7] = funcname ## _32_ ## depth ## _ ## cf;
 -
 -#define SET_QPEL_FUNCS(v, h, depth, cf, name) SET_LUMA_FUNCS  (put_hevc_qpel[v][h], name, depth, cf)
 -#define SET_EPEL_FUNCS(v, h, depth, cf, name) SET_CHROMA_FUNCS(put_hevc_epel[v][h], name, depth, cf)
 -
      if (bit_depth == 8) {
          if (EXTERNAL_MMXEXT(cpu_flags)) {
-             c->idct_dc[0] = ff_hevc_idct4x4_dc_8_mmxext;
-             c->idct_dc[1] = ff_hevc_idct8x8_dc_8_mmxext;
+             c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_mmxext;
+             c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_mmxext;
 +            c->add_residual[0] = ff_hevc_add_residual4_8_mmxext;
          }
          if (EXTERNAL_SSE2(cpu_flags)) {
              c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
              c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
 +            if (ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_sse2;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_sse2;
 +
 +            }
 +            SAO_BAND_INIT(8, sse2);
  
-             c->idct_dc[1] = ff_hevc_idct8x8_dc_8_sse2;
-             c->idct_dc[2] = ff_hevc_idct16x16_dc_8_sse2;
-             c->idct_dc[3] = ff_hevc_idct32x32_dc_8_sse2;
+             c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_sse2;
+             c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_sse2;
+             c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_sse2;
 -            SET_QPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
 -            SET_EPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
  
 -            SET_LUMA_FUNCS(put_unweighted_pred,              ff_hevc_put_unweighted_pred,     8, sse2);
 -            SET_LUMA_FUNCS(put_unweighted_pred_avg,          ff_hevc_put_unweighted_pred_avg, 8, sse2);
 -            SET_CHROMA_FUNCS(put_unweighted_pred_chroma,     ff_hevc_put_unweighted_pred,     8, sse2);
 -            SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 8, sse2);
 +            c->add_residual[1] = ff_hevc_add_residual8_8_sse2;
 +            c->add_residual[2] = ff_hevc_add_residual16_8_sse2;
 +            c->add_residual[3] = ff_hevc_add_residual32_8_sse2;
          }
          if (EXTERNAL_SSSE3(cpu_flags)) {
 -            SET_QPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_qpel_h);
 -            SET_QPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_qpel_v);
 -            SET_EPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_epel_h);
 -            SET_EPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_epel_v);
 +            if(ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
 +            }
 +            SAO_EDGE_INIT(8, ssse3);
 +        }
 +        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
 +
 +            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels,  8, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,      8, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,      8, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,     8, sse4);
 +
 +            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     8, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     8, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    8, sse4);
 +        }
 +        if (EXTERNAL_AVX(cpu_flags)) {
 +            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_avx;
 +            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_avx;
 +            if (ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx;
 +            }
 +            SAO_BAND_INIT(8, avx);
 +
 +            c->add_residual[1] = ff_hevc_add_residual8_8_avx;
 +            c->add_residual[2] = ff_hevc_add_residual16_8_avx;
 +            c->add_residual[3] = ff_hevc_add_residual32_8_avx;
 +        }
 +        if (EXTERNAL_AVX2(cpu_flags)) {
 +            c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_8_avx2;
 +            c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_8_avx2;
 +        }
 +        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
-             c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2;
-             c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2;
++            c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2;
++            c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2;
 +            if (ARCH_X86_64) {
 +                c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
 +                c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
 +                c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
 +
 +                c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
 +                c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
 +                c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
 +
 +                c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
 +                c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
 +                c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
 +
 +                c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
 +                c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
 +                c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
 +
 +                c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
 +                c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
 +                c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
 +
 +                c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
 +                c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
 +                c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
 +
 +                c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_avx2;
 +                c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_8_avx2;
 +                c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_8_avx2;
 +
 +                c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_avx2;
 +                c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_8_avx2;
 +                c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_8_avx2;
  
 +                c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_avx2;
 +                c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_8_avx2;
 +                c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_8_avx2;
 +
 +                c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_avx2;
 +                c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_8_avx2;
 +                c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_8_avx2;
 +
 +                c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_avx2;
 +                c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_8_avx2;
 +                c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_8_avx2;
 +
 +                c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_avx2;
 +                c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_8_avx2;
 +                c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_8_avx2;
 +
 +                c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_avx2;
 +                c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_8_avx2;
 +                c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_8_avx2;
 +
 +                c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_avx2;
 +                c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_8_avx2;
 +                c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_8_avx2;
 +
 +                c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_avx2;
 +                c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_8_avx2;
 +                c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_8_avx2;
 +
 +                c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_avx2;
 +                c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_avx2;
 +                c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_avx2;
 +
 +                c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_avx2;
 +                c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_avx2;
 +                c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_avx2;
 +
 +                c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_avx2;
 +                c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_avx2;
 +                c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_avx2;
 +
 +                c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_avx2;
 +                c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_avx2;
 +                c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_avx2;
 +
 +                c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_avx2;
 +                c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_avx2;
 +                c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_avx2;
 +
 +                c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_avx2;
 +                c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_avx2;
 +                c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_avx2;
 +            }
 +            SAO_BAND_INIT(8, avx2);
 +
 +            c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_8_avx2;
 +            c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_8_avx2;
 +            c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_8_avx2;
 +
 +            c->add_residual[3] = ff_hevc_add_residual32_8_avx2;
          }
      } else if (bit_depth == 10) {
          if (EXTERNAL_MMXEXT(cpu_flags)) {
 +            c->add_residual[0] = ff_hevc_add_residual4_10_mmxext;
-             c->idct_dc[0] = ff_hevc_idct4x4_dc_10_mmxext;
-             c->idct_dc[1] = ff_hevc_idct8x8_dc_10_mmxext;
+             c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_mmxext;
+             c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_mmxext;
          }
          if (EXTERNAL_SSE2(cpu_flags)) {
              c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
              c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2;
 +            if (ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_sse2;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_sse2;
 +            }
 +            SAO_BAND_INIT(10, sse2);
 +            SAO_EDGE_INIT(10, sse2);
  
-             c->idct_dc[1] = ff_hevc_idct8x8_dc_10_sse2;
-             c->idct_dc[2] = ff_hevc_idct16x16_dc_10_sse2;
-             c->idct_dc[3] = ff_hevc_idct32x32_dc_10_sse2;
+             c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_sse2;
+             c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_sse2;
+             c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_sse2;
  
 -            SET_QPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
 -            SET_EPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
 -
 -            SET_LUMA_FUNCS(put_unweighted_pred,              ff_hevc_put_unweighted_pred,     10, sse2);
 -            SET_LUMA_FUNCS(put_unweighted_pred_avg,          ff_hevc_put_unweighted_pred_avg, 10, sse2);
 -            SET_CHROMA_FUNCS(put_unweighted_pred_chroma,     ff_hevc_put_unweighted_pred,     10, sse2);
 -            SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 10, sse2);
 +            c->add_residual[1] = ff_hevc_add_residual8_10_sse2;
 +            c->add_residual[2] = ff_hevc_add_residual16_10_sse2;
 +            c->add_residual[3] = ff_hevc_add_residual32_10_sse2;
          }
 -    }
 -
 -#if ARCH_X86_64
 -    if (bit_depth == 8) {
 -        if (EXTERNAL_SSSE3(cpu_flags)) {
 -            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
 -            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
 +        if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
 +            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
 +            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
          }
 +        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
 +            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 10, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,     10, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,     10, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,    10, sse4);
  
 -        if (EXTERNAL_SSE4(cpu_flags)) {
 -            SET_LUMA_FUNCS(weighted_pred,              ff_hevc_put_weighted_pred,     8, sse4);
 -            SET_CHROMA_FUNCS(weighted_pred_chroma,     ff_hevc_put_weighted_pred,     8, sse4);
 -            SET_LUMA_FUNCS(weighted_pred_avg,          ff_hevc_put_weighted_pred_avg, 8, sse4);
 -            SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 8, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     10, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     10, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    10, sse4);
          }
 -
          if (EXTERNAL_AVX(cpu_flags)) {
 -#if HAVE_AVX_EXTERNAL
 -            SET_QPEL_FUNCS(1, 1, 8, avx, hevc_qpel_hv);
 -            SET_EPEL_FUNCS(1, 1, 8, avx, hevc_epel_hv);
 -#endif /* HAVE_AVX_EXTERNAL */
 +            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_avx;
 +            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_avx;
 +            if (ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_avx;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_avx;
 +            }
 +            SAO_BAND_INIT(10, avx);
          }
          if (EXTERNAL_AVX2(cpu_flags)) {
 -            c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2;
 -            c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2;
 +            c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_10_avx2;
          }
 -    } else if (bit_depth == 10) {
 -        if (EXTERNAL_SSSE3(cpu_flags)) {
 -            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
 -            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
 +        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
-             c->idct_dc[2] = ff_hevc_idct16x16_dc_10_avx2;
-             c->idct_dc[3] = ff_hevc_idct32x32_dc_10_avx2;
++            c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2;
++            c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2;
 +            if (ARCH_X86_64) {
 +                c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
 +                c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
 +                c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
 +                c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
 +                c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
 +
 +                c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
 +                c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
 +                c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
 +                c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
 +                c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
 +
 +                c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
 +                c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
 +                c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
 +                c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
 +                c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
 +
 +                c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
 +                c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
 +                c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
 +                c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
 +                c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
 +
 +                c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
 +                c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
 +                c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
 +                c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
 +                c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
 +                c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
 +                c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
 +                c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
 +                c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
 +                c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
 +
 +                c->put_hevc_epel[5][0][1] = ff_hevc_put_hevc_epel_h16_10_avx2;
 +                c->put_hevc_epel[6][0][1] = ff_hevc_put_hevc_epel_h24_10_avx2;
 +                c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_10_avx2;
 +                c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_10_avx2;
 +                c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_10_avx2;
 +
 +                c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_10_avx2;
 +                c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_10_avx2;
 +                c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_10_avx2;
 +                c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_10_avx2;
 +                c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_10_avx2;
 +
 +                c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_10_avx2;
 +                c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_10_avx2;
 +                c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_10_avx2;
 +                c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_10_avx2;
 +                c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_10_avx2;
 +
 +                c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_10_avx2;
 +                c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_10_avx2;
 +                c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_10_avx2;
 +                c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_10_avx2;
 +                c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_10_avx2;
 +
 +                c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_10_avx2;
 +                c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_10_avx2;
 +                c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_10_avx2;
 +                c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_10_avx2;
 +                c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_10_avx2;
 +
 +                c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_10_avx2;
 +                c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_10_avx2;
 +                c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_10_avx2;
 +                c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_10_avx2;
 +                c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_10_avx2;
 +
 +                c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_10_avx2;
 +                c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_10_avx2;
 +                c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_10_avx2;
 +                c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_10_avx2;
 +                c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_10_avx2;
 +
 +                c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_10_avx2;
 +                c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_10_avx2;
 +                c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_10_avx2;
 +                c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_10_avx2;
 +                c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_10_avx2;
 +
 +                c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_10_avx2;
 +                c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_10_avx2;
 +                c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_10_avx2;
 +                c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_10_avx2;
 +                c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_10_avx2;
 +
 +                c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_10_avx2;
 +                c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_10_avx2;
 +                c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_10_avx2;
 +                c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_10_avx2;
 +                c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_10_avx2;
 +
 +                c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_10_avx2;
 +                c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_10_avx2;
 +                c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_10_avx2;
 +                c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_10_avx2;
 +                c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_10_avx2;
 +
 +                c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_10_avx2;
 +                c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_10_avx2;
 +                c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_10_avx2;
 +                c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_10_avx2;
 +                c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_10_avx2;
 +
 +                c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_10_avx2;
 +                c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_10_avx2;
 +                c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_10_avx2;
 +                c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_10_avx2;
 +                c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_10_avx2;
 +
 +                c->put_hevc_qpel_uni[5][1][0] = ff_hevc_put_hevc_uni_qpel_v16_10_avx2;
 +                c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_10_avx2;
 +                c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_10_avx2;
 +                c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_10_avx2;
 +                c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_10_avx2;
 +
 +                c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_10_avx2;
 +                c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_10_avx2;
 +                c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_10_avx2;
 +                c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_10_avx2;
 +                c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_10_avx2;
 +
 +                c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_10_avx2;
 +                c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_10_avx2;
 +                c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_10_avx2;
 +                c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_10_avx2;
 +                c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_10_avx2;
 +
 +                c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_10_avx2;
 +                c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_10_avx2;
 +                c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_10_avx2;
 +                c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_10_avx2;
 +                c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_10_avx2;
 +
 +                c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_10_avx2;
 +                c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_10_avx2;
 +                c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_10_avx2;
 +                c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_10_avx2;
 +                c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_10_avx2;
 +            }
 +            SAO_BAND_INIT(10, avx2);
 +            SAO_EDGE_INIT(10, avx2);
 +
 +            c->add_residual[2] = ff_hevc_add_residual16_10_avx2;
 +            c->add_residual[3] = ff_hevc_add_residual32_10_avx2;
 +
          }
 -        if (EXTERNAL_SSE4(cpu_flags)) {
 -            SET_LUMA_FUNCS(weighted_pred,              ff_hevc_put_weighted_pred,     10, sse4);
 -            SET_CHROMA_FUNCS(weighted_pred_chroma,     ff_hevc_put_weighted_pred,     10, sse4);
 -            SET_LUMA_FUNCS(weighted_pred_avg,          ff_hevc_put_weighted_pred_avg, 10, sse4);
 -            SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 10, sse4);
 +    } else if (bit_depth == 12) {
 +        if (EXTERNAL_MMXEXT(cpu_flags)) {
-             c->idct_dc[0] = ff_hevc_idct4x4_dc_12_mmxext;
-             c->idct_dc[1] = ff_hevc_idct8x8_dc_12_mmxext;
++            c->idct_dc[0] = ff_hevc_idct_4x4_dc_12_mmxext;
++            c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_mmxext;
 +        }
 +        if (EXTERNAL_SSE2(cpu_flags)) {
 +            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2;
 +            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2;
 +            if (ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
 +            }
 +            SAO_BAND_INIT(12, sse2);
 +            SAO_EDGE_INIT(12, sse2);
 +
-             c->idct_dc[1] = ff_hevc_idct8x8_dc_12_sse2;
-             c->idct_dc[2] = ff_hevc_idct16x16_dc_12_sse2;
-             c->idct_dc[3] = ff_hevc_idct32x32_dc_12_sse2;
++            c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_sse2;
++            c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_sse2;
++            c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_sse2;
 +        }
 +        if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
 +            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
 +            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_ssse3;
 +        }
 +        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
 +            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 12, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,     12, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,     12, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,    12, sse4);
 +
 +            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 12, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     12, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     12, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    12, sse4);
          }
          if (EXTERNAL_AVX(cpu_flags)) {
 -#if HAVE_AVX_EXTERNAL
 -            SET_QPEL_FUNCS(0, 1, 10, avx, ff_hevc_qpel_h);
 -            SET_QPEL_FUNCS(1, 0, 10, avx, ff_hevc_qpel_v);
 -            SET_QPEL_FUNCS(1, 1, 10, avx, hevc_qpel_hv);
 -            SET_EPEL_FUNCS(0, 1, 10, avx, ff_hevc_epel_h);
 -            SET_EPEL_FUNCS(1, 0, 10, avx, ff_hevc_epel_v);
 -            SET_EPEL_FUNCS(1, 1, 10, avx, hevc_epel_hv);
 -#endif /* HAVE_AVX_EXTERNAL */
 +            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_avx;
 +            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_avx;
 +            if (ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_avx;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_avx;
 +            }
 +            SAO_BAND_INIT(12, avx);
          }
          if (EXTERNAL_AVX2(cpu_flags)) {
 -            c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2;
 -            c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2;
 +            c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_12_avx2;
 +        }
 +        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
-             c->idct_dc[2] = ff_hevc_idct16x16_dc_12_avx2;
-             c->idct_dc[3] = ff_hevc_idct32x32_dc_12_avx2;
++            c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_avx2;
++            c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_avx2;
 +
 +            SAO_BAND_INIT(12, avx2);
 +            SAO_EDGE_INIT(12, avx2);
          }
      }
 -#endif /* ARCH_X86_64 */
  }



More information about the ffmpeg-cvslog mailing list