[FFmpeg-cvslog] Merge commit '1bd890ad173d79e7906c5e1d06bf0a06cca4519d'

Tue Jan 31 16:39:46 EET 2017

ffmpeg | branch: master | Clément Bœsch <cboesch at gopro.com> | Tue Jan 31 11:20:54 2017 +0100| [d0e132bab68073588dc55844a31b053fb0ee1c83] | committer: Clément Bœsch

Merge commit '1bd890ad173d79e7906c5e1d06bf0a06cca4519d'

* commit '1bd890ad173d79e7906c5e1d06bf0a06cca4519d':
  hevc: Separate adding residual to prediction from IDCT

This merge should be a no-op, but it is not, because it brings in the following renames:

- transform_add  → add_residual
- transform_skip → dequant
- idct_4x4_luma  → transform_4x4_luma

Merged-by: Clément Bœsch <cboesch at gopro.com>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=d0e132bab68073588dc55844a31b053fb0ee1c83
---

 libavcodec/arm/hevcdsp_idct_neon.S  |  8 ++++----
 libavcodec/arm/hevcdsp_init_neon.c  | 26 ++++++++++++-------------
 libavcodec/hevc.c                   |  4 ++--
 libavcodec/hevc_cabac.c             |  6 +++---
 libavcodec/hevcdsp.c                | 12 ++++++------
 libavcodec/hevcdsp.h                |  6 +++---
 libavcodec/hevcdsp_template.c       | 39 ++++++++++++++++---------------------
 libavcodec/mips/hevcdsp_init_mips.c | 10 +++++-----
 libavcodec/x86/hevc_res_add.asm     | 36 +++++++++++++++++-----------------
 libavcodec/x86/hevcdsp.h            | 28 +++++++++++++-------------
 libavcodec/x86/hevcdsp_init.c       | 28 +++++++++++++-------------
 11 files changed, 99 insertions(+), 104 deletions(-)

diff --git a/libavcodec/arm/hevcdsp_idct_neon.S b/libavcodec/arm/hevcdsp_idct_neon.S
index 13d540e..e39d006 100644
--- a/libavcodec/arm/hevcdsp_idct_neon.S
+++ b/libavcodec/arm/hevcdsp_idct_neon.S
@@ -97,7 +97,7 @@ function ff_hevc_idct_32x32_dc_neon_8, export=1
         bx lr
 endfunc
 
-function ff_hevc_transform_add_4x4_neon_8, export=1
+function ff_hevc_add_residual_4x4_neon_8, export=1
         vldm        r1, {q0-q1}
         vld1.32     d4[0], [r0], r2
         vld1.32     d4[1], [r0], r2
@@ -117,7 +117,7 @@ function ff_hevc_transform_add_4x4_neon_8, export=1
         bx          lr
 endfunc
 
-function ff_hevc_transform_add_8x8_neon_8, export=1
+function ff_hevc_add_residual_8x8_neon_8, export=1
         mov         r3,   #8
 1:      subs        r3,   #1
         vld1.16     {q0}, [r1]!
@@ -130,7 +130,7 @@ function ff_hevc_transform_add_8x8_neon_8, export=1
         bx          lr
 endfunc
 
-function ff_hevc_transform_add_16x16_neon_8, export=1
+function ff_hevc_add_residual_16x16_neon_8, export=1
         mov         r3,   #16
 1:      subs        r3,   #1
         vld1.16     {q0, q1}, [r1]!
@@ -146,7 +146,7 @@ function ff_hevc_transform_add_16x16_neon_8, export=1
         bx          lr
 endfunc
 
-function ff_hevc_transform_add_32x32_neon_8, export=1
+function ff_hevc_add_residual_32x32_neon_8, export=1
         mov         r3,   #32
 1:      subs        r3,   #1
         vldm        r1!, {q0-q3}
diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
index 5591807..1a3912c 100644
--- a/libavcodec/arm/hevcdsp_init_neon.c
+++ b/libavcodec/arm/hevcdsp_init_neon.c
@@ -34,14 +34,14 @@ void ff_hevc_idct_8x8_dc_neon_8(int16_t *coeffs);
 void ff_hevc_idct_16x16_dc_neon_8(int16_t *coeffs);
 void ff_hevc_idct_32x32_dc_neon_8(int16_t *coeffs);
 void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs);
-void ff_hevc_transform_add_4x4_neon_8(uint8_t *_dst, int16_t *coeffs,
-                                      ptrdiff_t stride);
-void ff_hevc_transform_add_8x8_neon_8(uint8_t *_dst, int16_t *coeffs,
-                                      ptrdiff_t stride);
-void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
-                                      ptrdiff_t stride);
-void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
-                                      ptrdiff_t stride);
+void ff_hevc_add_residual_4x4_neon_8(uint8_t *_dst, int16_t *coeffs,
+                                     ptrdiff_t stride);
+void ff_hevc_add_residual_8x8_neon_8(uint8_t *_dst, int16_t *coeffs,
+                                     ptrdiff_t stride);
+void ff_hevc_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
+                                       ptrdiff_t stride);
+void ff_hevc_add_residual_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
+                                       ptrdiff_t stride);
 
 #define PUT_PIXELS(name) \
     void name(int16_t *dst, uint8_t *src, \
@@ -156,11 +156,11 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
         c->idct_dc[1]                  = ff_hevc_idct_8x8_dc_neon_8;
         c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_neon_8;
         c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_neon_8;
-        c->transform_add[0]            = ff_hevc_transform_add_4x4_neon_8;
-        c->transform_add[1]            = ff_hevc_transform_add_8x8_neon_8;
-        c->transform_add[2]            = ff_hevc_transform_add_16x16_neon_8;
-        c->transform_add[3]            = ff_hevc_transform_add_32x32_neon_8;
-        c->idct_4x4_luma               = ff_hevc_transform_luma_4x4_neon_8;
+        c->add_residual[0]             = ff_hevc_add_residual_4x4_neon_8;
+        c->add_residual[1]             = ff_hevc_add_residual_8x8_neon_8;
+        c->add_residual[2]             = ff_hevc_add_residual_16x16_neon_8;
+        c->add_residual[3]             = ff_hevc_add_residual_32x32_neon_8;
+        c->transform_4x4_luma          = ff_hevc_transform_luma_4x4_neon_8;
         put_hevc_qpel_neon[1][0]       = ff_hevc_put_qpel_v1_neon_8;
         put_hevc_qpel_neon[2][0]       = ff_hevc_put_qpel_v2_neon_8;
         put_hevc_qpel_neon[3][0]       = ff_hevc_put_qpel_v3_neon_8;
diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
index 607a8da..505249e 100644
--- a/libavcodec/hevc.c
+++ b/libavcodec/hevc.c
@@ -1052,7 +1052,7 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
                         for (i = 0; i < (size * size); i++) {
                             coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
                         }
-                        s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
+                        s->hevcdsp.add_residual[log2_trafo_size_c-2](dst, coeffs, stride);
                     }
             }
 
@@ -1081,7 +1081,7 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
                         for (i = 0; i < (size * size); i++) {
                             coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
                         }
-                        s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
+                        s->hevcdsp.add_residual[log2_trafo_size_c-2](dst, coeffs, stride);
                     }
             }
         } else if (s->ps.sps->chroma_format_idc && blk_idx == 3) {
diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
index 05b2821..457e087 100644
--- a/libavcodec/hevc_cabac.c
+++ b/libavcodec/hevc_cabac.c
@@ -1476,7 +1476,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
                     FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]);
             }
 
-            s->hevcdsp.transform_skip(coeffs, log2_trafo_size);
+            s->hevcdsp.dequant(coeffs, log2_trafo_size);
 
             if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
                                         lc->cu.pred_mode == MODE_INTRA &&
@@ -1486,7 +1486,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
                 s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
             }
         } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
-            s->hevcdsp.idct_4x4_luma(coeffs);
+            s->hevcdsp.transform_4x4_luma(coeffs);
         } else {
             int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
             if (max_xy == 0)
@@ -1510,7 +1510,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
             coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
         }
     }
-    s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride);
+    s->hevcdsp.add_residual[log2_trafo_size-2](dst, coeffs, stride);
 }
 
 void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size)
diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
index 9d773d9..23e923f 100644
--- a/libavcodec/hevcdsp.c
+++ b/libavcodec/hevcdsp.c
@@ -195,13 +195,13 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
 
 #define HEVC_DSP(depth)                                                     \
     hevcdsp->put_pcm                = FUNC(put_pcm, depth);                 \
-    hevcdsp->transform_add[0]       = FUNC(transform_add4x4, depth);        \
-    hevcdsp->transform_add[1]       = FUNC(transform_add8x8, depth);        \
-    hevcdsp->transform_add[2]       = FUNC(transform_add16x16, depth);      \
-    hevcdsp->transform_add[3]       = FUNC(transform_add32x32, depth);      \
-    hevcdsp->transform_skip         = FUNC(transform_skip, depth);          \
+    hevcdsp->add_residual[0]        = FUNC(add_residual4x4, depth);         \
+    hevcdsp->add_residual[1]        = FUNC(add_residual8x8, depth);         \
+    hevcdsp->add_residual[2]        = FUNC(add_residual16x16, depth);       \
+    hevcdsp->add_residual[3]        = FUNC(add_residual32x32, depth);       \
+    hevcdsp->dequant                = FUNC(dequant, depth);                 \
     hevcdsp->transform_rdpcm        = FUNC(transform_rdpcm, depth);         \
-    hevcdsp->idct_4x4_luma          = FUNC(transform_4x4_luma, depth);      \
+    hevcdsp->transform_4x4_luma     = FUNC(transform_4x4_luma, depth);      \
     hevcdsp->idct[0]                = FUNC(idct_4x4, depth);                \
     hevcdsp->idct[1]                = FUNC(idct_8x8, depth);                \
     hevcdsp->idct[2]                = FUNC(idct_16x16, depth);              \
diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
index 9f1f6dd..3b7e737 100644
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@ -46,13 +46,13 @@ typedef struct HEVCDSPContext {
     void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
                     struct GetBitContext *gb, int pcm_bit_depth);
 
-    void (*transform_add[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride);
+    void (*add_residual[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride);
 
-    void (*transform_skip)(int16_t *coeffs, int16_t log2_size);
+    void (*dequant)(int16_t *coeffs, int16_t log2_size);
 
     void (*transform_rdpcm)(int16_t *coeffs, int16_t log2_size, int mode);
 
-    void (*idct_4x4_luma)(int16_t *coeffs);
+    void (*transform_4x4_luma)(int16_t *coeffs);
 
     void (*idct[4])(int16_t *coeffs, int col_limit);
 
diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c
index 78eb354..66b1ac0 100644
--- a/libavcodec/hevcdsp_template.c
+++ b/libavcodec/hevcdsp_template.c
@@ -42,8 +42,8 @@ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height
     }
 }
 
-static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coeffs,
-                                                     ptrdiff_t stride, int size)
+static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *res,
+                                                ptrdiff_t stride, int size)
 {
     int x, y;
     pixel *dst = (pixel *)_dst;
@@ -52,35 +52,35 @@ static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coe
 
     for (y = 0; y < size; y++) {
         for (x = 0; x < size; x++) {
-            dst[x] = av_clip_pixel(dst[x] + *coeffs);
-            coeffs++;
+            dst[x] = av_clip_pixel(dst[x] + *res);
+            res++;
         }
         dst += stride;
     }
 }
 
-static void FUNC(transform_add4x4)(uint8_t *_dst, int16_t *coeffs,
-                                       ptrdiff_t stride)
+static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *res,
+                                  ptrdiff_t stride)
 {
-    FUNC(transquant_bypass)(_dst, coeffs, stride, 4);
+    FUNC(add_residual)(_dst, res, stride, 4);
 }
 
-static void FUNC(transform_add8x8)(uint8_t *_dst, int16_t *coeffs,
-                                       ptrdiff_t stride)
+static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *res,
+                                  ptrdiff_t stride)
 {
-    FUNC(transquant_bypass)(_dst, coeffs, stride, 8);
+    FUNC(add_residual)(_dst, res, stride, 8);
 }
 
-static void FUNC(transform_add16x16)(uint8_t *_dst, int16_t *coeffs,
-                                         ptrdiff_t stride)
+static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *res,
+                                    ptrdiff_t stride)
 {
-    FUNC(transquant_bypass)(_dst, coeffs, stride, 16);
+    FUNC(add_residual)(_dst, res, stride, 16);
 }
 
-static void FUNC(transform_add32x32)(uint8_t *_dst, int16_t *coeffs,
-                                         ptrdiff_t stride)
+static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res,
+                                    ptrdiff_t stride)
 {
-    FUNC(transquant_bypass)(_dst, coeffs, stride, 32);
+    FUNC(add_residual)(_dst, res, stride, 32);
 }
 
 
@@ -106,13 +106,11 @@ static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode)
     }
 }
 
-static void FUNC(transform_skip)(int16_t *_coeffs, int16_t log2_size)
+static void FUNC(dequant)(int16_t *coeffs, int16_t log2_size)
 {
     int shift  = 15 - BIT_DEPTH - log2_size;
     int x, y;
     int size = 1 << log2_size;
-    int16_t *coeffs = _coeffs;
-
 
     if (shift > 0) {
         int offset = 1 << (shift - 1);
@@ -134,8 +132,6 @@ static void FUNC(transform_skip)(int16_t *_coeffs, int16_t log2_size)
 
 #define SET(dst, x)   (dst) = (x)
 #define SCALE(dst, x) (dst) = av_clip_int16(((x) + add) >> shift)
-#define ADD_AND_SCALE(dst, x)                                           \
-    (dst) = av_clip_pixel((dst) + av_clip_int16(((x) + add) >> shift))
 
 #define TR_4x4_LUMA(dst, src, step, assign)                             \
     do {                                                                \
@@ -299,7 +295,6 @@ IDCT_DC(32)
 
 #undef SET
 #undef SCALE
-#undef ADD_AND_SCALE
 
 static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src,
                                   ptrdiff_t stride_dst, ptrdiff_t stride_src,
diff --git a/libavcodec/mips/hevcdsp_init_mips.c b/libavcodec/mips/hevcdsp_init_mips.c
index 3675b93..776d13e 100644
--- a/libavcodec/mips/hevcdsp_init_mips.c
+++ b/libavcodec/mips/hevcdsp_init_mips.c
@@ -437,11 +437,11 @@ static av_cold void hevc_dsp_init_msa(HEVCDSPContext *c,
         c->idct_dc[1] = ff_hevc_idct_dc_8x8_msa;
         c->idct_dc[2] = ff_hevc_idct_dc_16x16_msa;
         c->idct_dc[3] = ff_hevc_idct_dc_32x32_msa;
-        c->transform_add[0] = ff_hevc_addblk_4x4_msa;
-        c->transform_add[1] = ff_hevc_addblk_8x8_msa;
-        c->transform_add[2] = ff_hevc_addblk_16x16_msa;
-        c->transform_add[3] = ff_hevc_addblk_32x32_msa;
-        c->idct_4x4_luma = ff_hevc_idct_luma_4x4_msa;
+        c->add_residual[0] = ff_hevc_addblk_4x4_msa;
+        c->add_residual[1] = ff_hevc_addblk_8x8_msa;
+        c->add_residual[2] = ff_hevc_addblk_16x16_msa;
+        c->add_residual[3] = ff_hevc_addblk_32x32_msa;
+        c->transform_4x4_luma = ff_hevc_idct_luma_4x4_msa;
     }
 }
 #endif  // #if HAVE_MSA
diff --git a/libavcodec/x86/hevc_res_add.asm b/libavcodec/x86/hevc_res_add.asm
index dc3e88a..869288f 100644
--- a/libavcodec/x86/hevc_res_add.asm
+++ b/libavcodec/x86/hevc_res_add.asm
@@ -1,5 +1,5 @@
 ; /*
-; * Provide SIMD optimizations for transform_add functions for HEVC decoding
+; * Provide SIMD optimizations for add_residual functions for HEVC decoding
 ; * Copyright (c) 2014 Pierre-Edouard LEPERE
 ; *
 ; * This file is part of FFmpeg.
@@ -52,7 +52,7 @@ cextern pw_1023
 
 INIT_MMX mmxext
 ; void ff_hevc_tranform_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_transform_add4_8, 3, 4, 6
+cglobal hevc_add_residual4_8, 3, 4, 6
     TR_ADD_MMX_4_8
     add               r1, 16
     lea               r0, [r0+r2*2]
@@ -135,8 +135,8 @@ cglobal hevc_transform_add4_8, 3, 4, 6
 
 
 %macro TRANSFORM_ADD_8 0
-; void ff_hevc_transform_add8_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_transform_add8_8, 3, 4, 8
+; void ff_hevc_add_residual8_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_add_residual8_8, 3, 4, 8
     lea               r3, [r2*3]
     TR_ADD_SSE_8_8
     add               r1, 64
@@ -144,8 +144,8 @@ cglobal hevc_transform_add8_8, 3, 4, 8
     TR_ADD_SSE_8_8
     RET
 
-; void ff_hevc_transform_add16_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_transform_add16_8, 3, 4, 7
+; void ff_hevc_add_residual16_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_add_residual16_8, 3, 4, 7
     pxor              m0, m0
     lea               r3, [r2*3]
     TR_ADD_SSE_16_32_8  0, r0,      r0+r2
@@ -158,8 +158,8 @@ cglobal hevc_transform_add16_8, 3, 4, 7
 %endrep
     RET
 
-; void ff_hevc_transform_add32_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_transform_add32_8, 3, 4, 7
+; void ff_hevc_add_residual32_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_add_residual32_8, 3, 4, 7
     pxor               m0, m0
     TR_ADD_SSE_16_32_8  0, r0,    r0+16
     TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
@@ -179,8 +179,8 @@ TRANSFORM_ADD_8
 
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
-; void ff_hevc_transform_add32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_transform_add32_8, 3, 4, 7
+; void ff_hevc_add_residual32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_add_residual32_8, 3, 4, 7
     pxor              m0, m0
     lea               r3, [r2*3]
     TR_ADD_SSE_16_32_8   0, r0,      r0+r2
@@ -195,7 +195,7 @@ cglobal hevc_transform_add32_8, 3, 4, 7
 %endif
 
 ;-----------------------------------------------------------------------------
-; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride)
+; void ff_hevc_add_residual_10(pixel *dst, int16_t *block, int stride)
 ;-----------------------------------------------------------------------------
 %macro TR_ADD_SSE_8_10 4
     mova              m0, [%4]
@@ -310,7 +310,7 @@ cglobal hevc_transform_add32_8, 3, 4, 7
 
 
 INIT_MMX mmxext
-cglobal hevc_transform_add4_10,3,4, 6
+cglobal hevc_add_residual4_10,3,4, 6
     pxor              m2, m2
     mova              m3, [max_pixels_10]
     TR_ADD_MMX4_10     r0, r2, r1
@@ -320,10 +320,10 @@ cglobal hevc_transform_add4_10,3,4, 6
     RET
 
 ;-----------------------------------------------------------------------------
-; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride)
+; void ff_hevc_add_residual_10(pixel *dst, int16_t *block, int stride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal hevc_transform_add8_10,3,4,6
+cglobal hevc_add_residual8_10,3,4,6
     pxor              m4, m4
     mova              m5, [max_pixels_10]
     lea               r3, [r2*3]
@@ -334,7 +334,7 @@ cglobal hevc_transform_add8_10,3,4,6
     TR_ADD_SSE_8_10      r0, r2, r3, r1
     RET
 
-cglobal hevc_transform_add16_10,3,4,6
+cglobal hevc_add_residual16_10,3,4,6
     pxor              m4, m4
     mova              m5, [max_pixels_10]
 
@@ -346,7 +346,7 @@ cglobal hevc_transform_add16_10,3,4,6
 %endrep
     RET
 
-cglobal hevc_transform_add32_10,3,4,6
+cglobal hevc_add_residual32_10,3,4,6
     pxor              m4, m4
     mova              m5, [max_pixels_10]
 
@@ -361,7 +361,7 @@ cglobal hevc_transform_add32_10,3,4,6
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
 
-cglobal hevc_transform_add16_10,3,4,6
+cglobal hevc_add_residual16_10,3,4,6
     pxor              m4, m4
     mova              m5, [max_pixels_10]
     lea               r3, [r2*3]
@@ -374,7 +374,7 @@ cglobal hevc_transform_add16_10,3,4,6
 %endrep
     RET
 
-cglobal hevc_transform_add32_10,3,4,6
+cglobal hevc_add_residual32_10,3,4,6
     pxor              m4, m4
     mova              m5, [max_pixels_10]
 
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
index ad8168f..3cfdc27 100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -239,23 +239,23 @@ WEIGHTING_PROTOTYPES(12, sse4);
 ///////////////////////////////////////////////////////////////////////////////
 // TRANSFORM_ADD
 ///////////////////////////////////////////////////////////////////////////////
-void ff_hevc_transform_add4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_transform_add8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_transform_add32_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual32_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 
-void ff_hevc_transform_add8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_transform_add16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_transform_add32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 
-void ff_hevc_transform_add32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 
-void ff_hevc_transform_add4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_transform_add8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_transform_add16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_transform_add32_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual32_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 
-void ff_hevc_transform_add16_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_transform_add32_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual16_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual32_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 
 #endif // AVCODEC_X86_HEVCDSP_H
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 09eb06d..da73d76 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -700,7 +700,7 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
         if (EXTERNAL_MMXEXT(cpu_flags)) {
             c->idct_dc[0] = ff_hevc_idct4x4_dc_8_mmxext;
             c->idct_dc[1] = ff_hevc_idct8x8_dc_8_mmxext;
-            c->transform_add[0]    =  ff_hevc_transform_add4_8_mmxext;
+            c->add_residual[0] = ff_hevc_add_residual4_8_mmxext;
         }
         if (EXTERNAL_SSE2(cpu_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
@@ -716,9 +716,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             c->idct_dc[2] = ff_hevc_idct16x16_dc_8_sse2;
             c->idct_dc[3] = ff_hevc_idct32x32_dc_8_sse2;
 
-            c->transform_add[1]    = ff_hevc_transform_add8_8_sse2;
-            c->transform_add[2]    = ff_hevc_transform_add16_8_sse2;
-            c->transform_add[3]    = ff_hevc_transform_add32_8_sse2;
+            c->add_residual[1] = ff_hevc_add_residual8_8_sse2;
+            c->add_residual[2] = ff_hevc_add_residual16_8_sse2;
+            c->add_residual[3] = ff_hevc_add_residual32_8_sse2;
         }
         if (EXTERNAL_SSSE3(cpu_flags)) {
             if(ARCH_X86_64) {
@@ -748,9 +748,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             }
             SAO_BAND_INIT(8, avx);
 
-            c->transform_add[1]    = ff_hevc_transform_add8_8_avx;
-            c->transform_add[2]    = ff_hevc_transform_add16_8_avx;
-            c->transform_add[3]    = ff_hevc_transform_add32_8_avx;
+            c->add_residual[1] = ff_hevc_add_residual8_8_avx;
+            c->add_residual[2] = ff_hevc_add_residual16_8_avx;
+            c->add_residual[3] = ff_hevc_add_residual32_8_avx;
         }
         if (EXTERNAL_AVX2(cpu_flags)) {
             c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_8_avx2;
@@ -850,11 +850,11 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_8_avx2;
             c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_8_avx2;
 
-            c->transform_add[3]    = ff_hevc_transform_add32_8_avx2;
+            c->add_residual[3] = ff_hevc_add_residual32_8_avx2;
         }
     } else if (bit_depth == 10) {
         if (EXTERNAL_MMXEXT(cpu_flags)) {
-            c->transform_add[0] = ff_hevc_transform_add4_10_mmxext;
+            c->add_residual[0] = ff_hevc_add_residual4_10_mmxext;
             c->idct_dc[0] = ff_hevc_idct4x4_dc_10_mmxext;
             c->idct_dc[1] = ff_hevc_idct8x8_dc_10_mmxext;
         }
@@ -872,9 +872,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             c->idct_dc[2] = ff_hevc_idct16x16_dc_10_sse2;
             c->idct_dc[3] = ff_hevc_idct32x32_dc_10_sse2;
 
-            c->transform_add[1]    = ff_hevc_transform_add8_10_sse2;
-            c->transform_add[2]    = ff_hevc_transform_add16_10_sse2;
-            c->transform_add[3]    = ff_hevc_transform_add32_10_sse2;
+            c->add_residual[1] = ff_hevc_add_residual8_10_sse2;
+            c->add_residual[2] = ff_hevc_add_residual16_10_sse2;
+            c->add_residual[3] = ff_hevc_add_residual32_10_sse2;
         }
         if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
             c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
@@ -1053,8 +1053,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             SAO_BAND_INIT(10, avx2);
             SAO_EDGE_INIT(10, avx2);
 
-            c->transform_add[2] = ff_hevc_transform_add16_10_avx2;
-            c->transform_add[3] = ff_hevc_transform_add32_10_avx2;
+            c->add_residual[2] = ff_hevc_add_residual16_10_avx2;
+            c->add_residual[3] = ff_hevc_add_residual32_10_avx2;
 
         }
     } else if (bit_depth == 12) {


======================================================================

diff --cc libavcodec/arm/hevcdsp_idct_neon.S
index 13d540e,0000000..e39d006
mode 100644,000000..100644
--- a/libavcodec/arm/hevcdsp_idct_neon.S
+++ b/libavcodec/arm/hevcdsp_idct_neon.S
@@@ -1,465 -1,0 +1,465 @@@
 +/*
 + * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi at vtt.fi>
 + *
 + * This file is part of FFmpeg.
 + *
 + * FFmpeg is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public
 + * License as published by the Free Software Foundation; either
 + * version 2.1 of the License, or (at your option) any later version.
 + *
 + * FFmpeg is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with FFmpeg; if not, write to the Free Software
 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 + */
 +
 +#include "libavutil/arm/asm.S"
 +#include "neon.S"
 +
 +function ff_hevc_idct_4x4_dc_neon_8, export=1
 +        ldrsh       r1, [r0]
 +        ldr         r2, =0x20
 +        add         r1, #1
 +        asr         r1, #1
 +        add         r1, r2
 +        asr         r1, #6
 +        vdup.16     q0, r1
 +        vdup.16     q1, r1
 +        vst1.16     {q0, q1}, [r0]
 +        bx lr
 +endfunc
 +
 +function ff_hevc_idct_8x8_dc_neon_8, export=1
 +        ldrsh       r1, [r0]
 +        ldr         r2, =0x20
 +        add         r1, #1
 +        asr         r1, #1
 +        add         r1, r2
 +        asr         r1, #6
 +        vdup.16     q8, r1
 +        vdup.16     q9, r1
 +        vmov.16     q10, q8
 +        vmov.16     q11, q8
 +        vmov.16     q12, q8
 +        vmov.16     q13, q8
 +        vmov.16     q14, q8
 +        vmov.16     q15, q8
 +        vstm        r0, {q8-q15}
 +        bx lr
 +endfunc
 +
 +function ff_hevc_idct_16x16_dc_neon_8, export=1
 +        ldrsh       r1, [r0]
 +        ldr         r2, =0x20
 +        add         r1, #1
 +        asr         r1, #1
 +        add         r1, r2
 +        asr         r1, #6
 +        vdup.16     q8, r1
 +        vdup.16     q9, r1
 +        vmov.16     q10, q8
 +        vmov.16     q11, q8
 +        vmov.16     q12, q8
 +        vmov.16     q13, q8
 +        vmov.16     q14, q8
 +        vmov.16     q15, q8
 +        vstm        r0!, {q8-q15}
 +        vstm        r0!, {q8-q15}
 +        vstm        r0!, {q8-q15}
 +        vstm        r0, {q8-q15}
 +        bx lr
 +endfunc
 +
 +function ff_hevc_idct_32x32_dc_neon_8, export=1
 +        ldrsh       r1, [r0]
 +        ldr         r2, =0x20
 +        add         r1, #1
 +        asr         r1, #1
 +        add         r1, r2
 +        asr         r1, #6
 +        mov         r3, #16
 +        vdup.16     q8, r1
 +        vdup.16     q9, r1
 +        vmov.16     q10, q8
 +        vmov.16     q11, q8
 +        vmov.16     q12, q8
 +        vmov.16     q13, q8
 +        vmov.16     q14, q8
 +        vmov.16     q15, q8
 +1:      subs        r3, #1
 +        vstm        r0!, {q8-q15}
 +        bne         1b
 +        bx lr
 +endfunc
 +
- function ff_hevc_transform_add_4x4_neon_8, export=1
++function ff_hevc_add_residual_4x4_neon_8, export=1
 +        vldm        r1, {q0-q1}
 +        vld1.32     d4[0], [r0], r2
 +        vld1.32     d4[1], [r0], r2
 +        vld1.32     d5[0], [r0], r2
 +        vld1.32     d5[1], [r0], r2
 +        sub         r0, r0, r2, lsl #2
 +        vmovl.u8    q8, d4
 +        vmovl.u8    q9, d5
 +        vqadd.s16   q0, q0, q8
 +        vqadd.s16   q1, q1, q9
 +        vqmovun.s16 d0, q0
 +        vqmovun.s16 d1, q1
 +        vst1.32     d0[0], [r0], r2
 +        vst1.32     d0[1], [r0], r2
 +        vst1.32     d1[0], [r0], r2
 +        vst1.32     d1[1], [r0], r2
 +        bx          lr
 +endfunc
 +
- function ff_hevc_transform_add_8x8_neon_8, export=1
++function ff_hevc_add_residual_8x8_neon_8, export=1
 +        mov         r3,   #8
 +1:      subs        r3,   #1
 +        vld1.16     {q0}, [r1]!
 +        vld1.8      d16,  [r0]
 +        vmovl.u8    q8,   d16
 +        vqadd.s16   q0,   q8
 +        vqmovun.s16 d0,   q0
 +        vst1.32     d0,   [r0], r2
 +        bne         1b
 +        bx          lr
 +endfunc
 +
- function ff_hevc_transform_add_16x16_neon_8, export=1
++function ff_hevc_add_residual_16x16_neon_8, export=1  // dst[16x16 u8] = clip_u8(dst + residual), one row per iteration; r0 = dst, r1 = int16 residuals, r2 = stride
 +        mov         r3,   #16               // row counter
 +1:      subs        r3,   #1                // flags consumed by the bne at the end of the loop
 +        vld1.16     {q0, q1}, [r1]!         // 16 residuals for this row, advance r1
 +        vld1.8      {q8},  [r0]             // 16 dst pixels
 +        vmovl.u8    q9,  d16                // widen low 8 pixels
 +        vmovl.u8    q10, d17                // widen high 8 pixels
 +        vqadd.s16   q0,  q9                 // saturating residual + pixel
 +        vqadd.s16   q1,  q10
 +        vqmovun.s16 d0,  q0                 // narrow with unsigned saturation into d0/d1 (= q0)
 +        vqmovun.s16 d1,  q1
 +        vst1.8      {q0},   [r0], r2        // store the row, step dst by the stride
 +        bne         1b
 +        bx          lr
 +endfunc
 +
- function ff_hevc_transform_add_32x32_neon_8, export=1
++function ff_hevc_add_residual_32x32_neon_8, export=1  // dst[32x32 u8] = clip_u8(dst + residual), one row per iteration; r0 = dst, r1 = int16 residuals, r2 = stride
 +        mov         r3,   #32               // row counter
 +1:      subs        r3,   #1                // flags consumed by the bne at the end of the loop
 +        vldm        r1!, {q0-q3}            // 32 residuals for this row, advance r1
 +        vld1.8      {q8, q9},  [r0]         // 32 dst pixels
 +        vmovl.u8    q10, d16                // widen the four groups of 8 pixels to 16-bit
 +        vmovl.u8    q11, d17
 +        vmovl.u8    q12, d18
 +        vmovl.u8    q13, d19
 +        vqadd.s16   q0,  q10                // saturating residual + pixel
 +        vqadd.s16   q1,  q11
 +        vqadd.s16   q2,  q12
 +        vqadd.s16   q3,  q13
 +        vqmovun.s16 d0,  q0                 // narrow with unsigned saturation into d0-d3 (= q0, q1)
 +        vqmovun.s16 d1,  q1
 +        vqmovun.s16 d2,  q2
 +        vqmovun.s16 d3,  q3
 +        vst1.8     {q0, q1},   [r0], r2     // store the row, step dst by the stride
 +        bne         1b
 +        bx          lr
 +endfunc
 +
 +.macro  transpose_16b_8x8   r0, r1, r2, r3, r4, r5, r6, r7  // intended 8x8 transpose of 16-bit elements held in eight registers (64-, 32-, then 16-bit exchange stages)
 +        vtrn.64         \r0, \r4   // NOTE(review): VTRN is documented for 8/16/32-bit elements only; confirm vtrn.64 assembles (no invocation of this macro is visible here)
 +        vtrn.64         \r1, \r5
 +        vtrn.64         \r2, \r6
 +        vtrn.64         \r3, \r7
 +        vtrn.32         \r0, \r2   // exchange 32-bit pairs between registers two apart
 +        vtrn.32         \r1, \r3
 +        vtrn.32         \r4, \r6
 +        vtrn.32         \r5, \r7
 +        vtrn.16         \r0, \r1   // exchange 16-bit elements between neighbouring registers
 +        vtrn.16         \r2, \r3
 +        vtrn.16         \r4, \r5
 +        vtrn.16         \r6, \r7
 +.endm
 +
 +// in 4 q regs
 +// output 8 d regs
 +// NOTE(review): every invocation visible in this file passes four d registers, giving a 4x4 transpose of 16-bit elements
 +.macro transpose_16b_4x4    r0, r1, r2, r3
 +        vtrn.32         \r0, \r2   // exchange 32-bit pairs between registers two apart
 +        vtrn.32         \r1, \r3
 +        vtrn.16         \r0, \r1   // exchange 16-bit elements between neighbouring registers
 +        vtrn.16         \r2, \r3
 +.endm
 +
 +/* uses registers q2 and q4 - q9 for temp values (q3 is not touched) */
 +/* TODO: reorder */
 +.macro tr4_luma_shift r0, r1, r2, r3, shift  // 4-point luma transform on \r0..\r3 (4 x int16 each), results written back to \r0..\r3; multipliers expected in d0[0]=74, d0[1]=29, d1[0]=55 as 32-bit lanes
 +        vaddl.s16   q5, \r0, \r2    // c0 = src0 + src2
 +        vaddl.s16   q2, \r2, \r3    // c1 = src2 + src3
 +        vsubl.s16   q4, \r0, \r3    // c2 = src0 - src3
 +        vmull.s16   q6, \r1, d0[0]  // c3 = 74 * src1
 +
 +        vaddl.s16   q7, \r0, \r3    // src0 + src3
 +        vsubw.s16   q7, q7, \r2     // src0 - src2 + src3
 +        vmul.s32    q7, q7, d0[0]   // dst2 = 74 * (src0 - src2 + src3)
 +
 +        vmul.s32    q8, q5, d0[1]   // 29 * c0
 +        vmul.s32    q9, q2, d1[0]   // 55 * c1
 +        vadd.s32    q8, q9          // 29 * c0 + 55 * c1
 +        vadd.s32    q8, q6          // dst0 = 29 * c0 + 55 * c1 + c3
 +
 +        vmul.s32    q2, q2, d0[1]   // 29 * c1
 +        vmul.s32    q9, q4, d1[0]   // 55 * c2
 +        vsub.s32    q9, q2          // 55 * c2 - 29 * c1
 +        vadd.s32    q9, q6          // dst1 = 55 * c2 - 29 * c1 + c3
 +
 +        vmul.s32    q5, q5, d1[0]   // 55 * c0
 +        vmul.s32    q4, q4, d0[1]   // 29 * c2
 +        vadd.s32    q5, q4          // 55 * c0 + 29 * c2
 +        vsub.s32    q5, q6          // dst3 = 55 * c0 + 29 * c2 - c3
 +
 +        vqrshrn.s32   \r0, q8, \shift   // dst0: rounding shift right, saturate and narrow to int16
 +        vqrshrn.s32   \r1, q9, \shift   // dst1
 +        vqrshrn.s32   \r2, q7, \shift   // dst2
 +        vqrshrn.s32   \r3, q5, \shift   // dst3
 +.endm
 +
 +/* uses registers q2 - q6 for temp values */
 +/* 4-point butterfly without the final shift: 32-bit results are left in q4 (e0 + o0), q5 (e1 + o1), q6 (e1 - o1) and q3 (e0 - o0); expects d0[0] = 83 and d0[1] = 36 as 16-bit lanes */
 +.macro tr4 r0, r1, r2, r3
 +        vmull.s16  q4, \r1, d0[0]   // 83 * src1
 +        vmull.s16  q6, \r1, d0[1]   // 36 * src1
 +        vshll.s16  q2, \r0, #6   // 64 * src0
 +        vshll.s16  q3, \r2, #6   // 64 * src2
 +        vadd.s32   q5, q2, q3    // 64 * (src0 + src2)     e0
 +        vsub.s32   q2, q2, q3    // 64 * (src0 - src2)     e1
 +        vmlal.s16  q4, \r3, d0[1]   // 83 * src1 + 36 * src3  o0
 +        vmlsl.s16  q6, \r3, d0[0]   // 36 * src1 - 83 * src3  o1
 +
 +        vsub.s32   q3, q5, q4    // e0 - o0
 +        vadd.s32   q4, q5, q4    // e0 + o0
 +        vadd.s32   q5, q2, q6    // e1 + o1
 +        vsub.s32   q6, q2, q6    // e1 - o1
 +.endm
 +
 +.macro tr4_shift r0, r1, r2, r3, shift  // same butterfly as tr4, then narrows the four results back into \r0..\r3 with a rounding shift; clobbers q2-q6, expects d0[0] = 83 and d0[1] = 36
 +        vmull.s16  q4, \r1, d0[0]   // 83 * src1
 +        vmull.s16  q6, \r1, d0[1]   // 36 * src1
 +        vshll.s16  q2, \r0, #6   // 64 * src0
 +        vshll.s16  q3, \r2, #6   // 64 * src2
 +        vadd.s32   q5, q2, q3    // 64 * (src0 + src2)     e0
 +        vsub.s32   q2, q2, q3    // 64 * (src0 - src2)     e1
 +        vmlal.s16  q4, \r3, d0[1]   // 83 * src1 + 36 * src3  o0
 +        vmlsl.s16  q6, \r3, d0[0]   // 36 * src1 - 83 * src3  o1
 +
 +        vsub.s32   q3, q5, q4    // e0 - o0
 +        vadd.s32   q4, q5, q4    // e0 + o0
 +        vadd.s32   q5, q2, q6    // e1 + o1
 +        vsub.s32   q6, q2, q6    // e1 - o1
 +
 +        vqrshrn.s32   \r0, q4, \shift   // dst0 = e0 + o0, rounded, saturated, narrowed to int16
 +        vqrshrn.s32   \r1, q5, \shift   // dst1 = e1 + o1
 +        vqrshrn.s32   \r2, q6, \shift   // dst2 = e1 - o1
 +        vqrshrn.s32   \r3, q3, \shift   // dst3 = e0 - o0
 +.endm
 +
 +function ff_hevc_transform_4x4_neon_8, export=1  // in-place 4x4 inverse transform of the 16 int16 coefficients at r0; r1 (col_limit in the C prototype) is not read
 +        vpush       {d8-d15}        // tr4_shift clobbers q4-q6, which alias d8-d13
 +        vld1.16     {q14, q15}, [r0]  // coeffs
 +        ldr         r3, =0x00240053 // 36 and 83
 +        vmov.32     d0[0], r3       // 16-bit lanes: d0[0] = 83, d0[1] = 36
 +
 +        tr4_shift d28, d29, d30, d31, #7   // first pass
 +
 +        vtrn.16     d28, d29        // transpose the 4x4 block between the passes
 +        vtrn.16     d30, d31
 +        vtrn.32     q14, q15
 +
 +        tr4_shift d28, d29, d30, d31, #12  // second pass
 +
 +        vtrn.16     d28, d29        // transpose back
 +        vtrn.16     d30, d31
 +        vtrn.32     q14, q15
 +
 +        vst1.16     {q14, q15}, [r0]   // overwrite the input coefficients with the result
 +        vpop        {d8-d15}
 +        bx lr
 +endfunc
 +
 +function ff_hevc_transform_luma_4x4_neon_8, export=1  // in-place 4x4 luma transform (multipliers 74/29/55) of the 16 int16 coefficients at r0
 +        vpush       {d8-d15}        // tr4_luma_shift clobbers q4-q7, which alias d8-d15
 +        vld1.16     {q14, q15}, [r0]  // coeffs
 +        ldr         r3, =0x4a  // 74
 +        vmov.32     d0[0], r3   // 32-bit lane; its low half also serves as the 16-bit scalar for vmull.s16
 +        ldr         r3, =0x1d  // 29
 +        vmov.32     d0[1], r3
 +        ldr         r3, =0x37  // 55
 +        vmov.32     d1[0], r3
 +
 +        tr4_luma_shift d28, d29, d30, d31, #7    // first pass
 +
 +        vtrn.16     d28, d29        // transpose the 4x4 block between the passes
 +        vtrn.16     d30, d31
 +        vtrn.32     q14, q15
 +
 +        tr4_luma_shift d28, d29, d30, d31, #12   // second pass
 +
 +        vtrn.16     d28, d29        // transpose back
 +        vtrn.16     d30, d31
 +        vtrn.32     q14, q15
 +        vst1.16     {q14, q15}, [r0]   // overwrite the input coefficients with the result
 +        vpop        {d8-d15}
 +        bx lr
 +endfunc
 +
 +.macro tr8_begin in0, in1, in2, in3  // odd half of the 8-point transform: accumulates o_8[0..3] into q7..q10 from inputs src1/3/5/7; expects d1 = {75, 89, 18, 50} as 16-bit lanes 0..3
 +        vmull.s16  q7, \in0, d1[1]   // 89 * src1
 +        vmull.s16  q8, \in0, d1[0]   // 75 * src1
 +        vmull.s16  q9, \in0, d1[3]   // 50 * src1
 +        vmull.s16  q10, \in0, d1[2]  // 18 * src1
 +
 +        vmlal.s16  q7, \in1, d1[0]   // 75 * src3
 +        vmlsl.s16  q8, \in1, d1[2]   //-18 * src3
 +        vmlsl.s16  q9, \in1, d1[1]   //-89 * src3
 +        vmlsl.s16  q10, \in1, d1[3]  //-50 * src3
 +
 +        vmlal.s16  q7, \in2, d1[3]   // 50 * src5
 +        vmlsl.s16  q8, \in2, d1[1]   //-89 * src5
 +        vmlal.s16  q9, \in2, d1[2]   // 18 * src5
 +        vmlal.s16  q10, \in2, d1[0]  // 75 * src5
 +
 +        vmlal.s16  q7, \in3, d1[2]   // 18 * src7
 +        vmlsl.s16  q8, \in3, d1[3]   //-50 * src7
 +        vmlal.s16  q9, \in3, d1[0]   // 75 * src7
 +        vmlsl.s16  q10, \in3, d1[1]  //-89 * src7
 +.endm
 +
 +.macro tr8_end shift  // combines the even part left by tr4 (q4, q5, q6, q3 = e_8[0..3]) with the odd part from tr8_begin (q7..q10); narrowed outputs land in d2..d9 = dst[0..7]
 +        vadd.s32   q1, q4, q7   //  e_8[0] + o_8[0], dst[0]
 +        vsub.s32   q4, q4, q7   //  e_8[0] - o_8[0], dst[7]
 +
 +        vadd.s32   q2, q5, q8   // e_8[1] + o_8[1], dst[1]
 +        vsub.s32   q5, q5, q8   // e_8[1] - o_8[1], dst[6]
 +
 +        vadd.s32   q11, q6, q9  // e_8[2] + o_8[2], dst[2]
 +        vsub.s32    q6, q6, q9  // e_8[2] - o_8[2], dst[5]
 +
 +        vadd.s32   q12, q3, q10 // e_8[3] + o_8[3], dst[3]
 +        vsub.s32   q3, q3, q10  // e_8[3] - o_8[3], dst[4]
 +        vqrshrn.s32   d2, q1, \shift    // dst[0]
 +        vqrshrn.s32   d3, q2, \shift    // dst[1]
 +        vqrshrn.s32   d4, q11, \shift   // dst[2]
 +        vqrshrn.s32   d5, q12, \shift   // dst[3]
 +        vqrshrn.s32   d6, q3, \shift    // dst[4]
 +        vqrshrn.s32   d7, q6, \shift    // dst[5]
 +        vqrshrn.s32   d9, q4, \shift    // dst[7]; done before d8 because q4 aliases d8/d9 and writing d8 first would clobber this source
 +        vqrshrn.s32   d8, q5, \shift    // dst[6]
 +.endm
 +
 +function ff_hevc_transform_8x8_neon_8, export=1  // in-place 8x8 inverse transform of the int16 block at r0; r1 = col_limit, used only to skip the right half of the first pass
 +        push   {r4-r8}           // NOTE(review): only r5 is referenced in this function
 +        vpush {d8-d15}           // the tr* macros clobber q4-q7, which alias d8-d15
 +        mov    r5, #16           // row pitch in bytes (8 coefficients x 2 bytes)
 +
 +        adr       r3, tr4f
 +        vld1.16   {d0, d1}, [r3] // 16 contiguous bytes: d0 = tr4f constants, d1 = tr8f constants
 +
 +        // left half
 +        vld1.16 {d24}, [r0], r5  // d24..d31 = first 4 coefficients of rows 0..7
 +        vld1.16 {d25}, [r0], r5
 +        vld1.16 {d26}, [r0], r5
 +        vld1.16 {d27}, [r0], r5
 +        vld1.16 {d28}, [r0], r5
 +        vld1.16 {d29}, [r0], r5
 +        vld1.16 {d30}, [r0], r5
 +        vld1.16 {d31}, [r0], r5
 +        sub      r0, #128        // back to the start of the block
 +        tr8_begin d25, d27, d29, d31   // odd rows 1, 3, 5, 7
 +        tr4       d24, d26, d28, d30   // even rows 0, 2, 4, 6
 +        tr8_end   #7                   // first-pass shift
 +        vst1.16 {d2}, [r0], r5   // d2..d9 = output rows 0..7 of these 4 columns
 +        vst1.16 {d3}, [r0], r5
 +        vst1.16 {d4}, [r0], r5
 +        vst1.16 {d5}, [r0], r5
 +        vst1.16 {d6}, [r0], r5
 +        vst1.16 {d7}, [r0], r5
 +        vst1.16 {d8}, [r0], r5
 +        vst1.16 {d9}, [r0], r5
 +        sub      r0, #128
 +        //skip right half if col_limit in r1 is less than 4
 +        cmp      r1, #4
 +        blt      1f
 +        //right half
 +        add      r0, #8          // columns 4..7 start 8 bytes into each row
 +        vld1.16 {d24}, [r0], r5
 +        vld1.16 {d25}, [r0], r5
 +        vld1.16 {d26}, [r0], r5
 +        vld1.16 {d27}, [r0], r5
 +        vld1.16 {d28}, [r0], r5
 +        vld1.16 {d29}, [r0], r5
 +        vld1.16 {d30}, [r0], r5
 +        vld1.16 {d31}, [r0], r5
 +        sub      r0, #128
 +        tr8_begin d25, d27, d29, d31
 +        tr4       d24, d26, d28, d30
 +        tr8_end   #7
 +        vst1.16 {d2}, [r0], r5
 +        vst1.16 {d3}, [r0], r5
 +        vst1.16 {d4}, [r0], r5
 +        vst1.16 {d5}, [r0], r5
 +        vst1.16 {d6}, [r0], r5
 +        vst1.16 {d7}, [r0], r5
 +        vst1.16 {d8}, [r0], r5
 +        vst1.16 {d9}, [r0], r5
 +        sub      r0, #136        // undo the 128-byte advance and the +8 column offset
 +1:
 +        // top half
 +        vldm r0, {q12-q15} // coeffs
 +        transpose_16b_4x4 d24, d26, d28, d30   // after both transposes each d register holds one column of rows 0..3
 +        transpose_16b_4x4 d25, d27, d29, d31
 +        tr8_begin d26, d30, d27, d31           // odd columns 1, 3, 5, 7
 +        tr4 d24, d28, d25, d29                 // even columns 0, 2, 4, 6
 +        tr8_end #12                            // second-pass shift
 +        transpose_16b_4x4 d2, d3, d4, d5       // transpose the results back
 +        transpose_16b_4x4 d6, d7, d8, d9
 +        vswp     d7, d5          // NOTE(review): the four swaps presumably interleave the two 4x4 tiles into row order for the store; not traced lane by lane
 +        vswp     d7, d8
 +        vswp     d3, d6
 +        vswp     d6, d4
 +        vstm r0!, {q1-q4}        // write 64 bytes and advance r0 to the bottom half
 +
 +        // bottom half
 +        vldm r0, {q12-q15} // coeffs
 +        transpose_16b_4x4 d24, d26, d28, d30
 +        transpose_16b_4x4 d25, d27, d29, d31
 +        tr8_begin d26, d30, d27, d31
 +        tr4 d24, d28, d25, d29
 +        tr8_end #12
 +        transpose_16b_4x4 d2, d3, d4, d5
 +        transpose_16b_4x4 d6, d7, d8, d9
 +        vswp     d7, d5
 +        vswp     d7, d8
 +        vswp     d3, d6
 +        vswp     d6, d4
 +        //vstm     r0, {q1-q4}
 +        vst1.16 {q1-q2}, [r0]
 +        add     r0, #32
 +        vst1.16 {q3-q4}, [r0]
 +        sub     r0, #32          // NOTE(review): r0 is not read again before the return
 +        vpop {d8-d15}
 +        pop {r4-r8}
 +        bx lr
 +endfunc
 +
 +.align 4
 +tr4f:
 +.word 0x00240053  // d0[1] = 36, d0[0] = 83 (16-bit lanes after vld1.16 {d0, d1} in ff_hevc_transform_8x8_neon_8)
 +.word 0x00000000  // d0[3] = d0[2] = 0
 +tr8f:
 +.word 0x0059004b  // d1[1] = 89, d1[0] = 75
 +.word 0x00320012  // d1[3] = 50, d1[2] = 18
 +tr16:
 +.word 0x005a0057  // 90, 87 (NOTE(review): no code visible here loads tr16, so the lane names d2[...] used in the old comments are unconfirmed)
 +.word 0x00500046  // 80, 70
 +.word 0x0039002b  // 57, 43 (the old comment repeated d2[0]; presumably the next d register - TODO confirm)
 +.word 0x00190009  // 25, 9 (the old comment repeated d2[2]; presumably the next d register - TODO confirm)
diff --cc libavcodec/arm/hevcdsp_init_neon.c
index 5591807,0000000..1a3912c
mode 100644,000000..100644
--- a/libavcodec/arm/hevcdsp_init_neon.c
+++ b/libavcodec/arm/hevcdsp_init_neon.c
@@@ -1,224 -1,0 +1,224 @@@
 +/*
 + * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi at vtt.fi>
 + *
 + * This file is part of FFmpeg.
 + *
 + * FFmpeg is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public
 + * License as published by the Free Software Foundation; either
 + * version 2.1 of the License, or (at your option) any later version.
 + *
 + * FFmpeg is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with FFmpeg; if not, write to the Free Software
 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 + */
 +
 +#include "libavutil/attributes.h"
 +#include "libavutil/arm/cpu.h"
 +#include "libavcodec/hevcdsp.h"
 +#include "hevcdsp_arm.h"
 +
 +void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
 +void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
 +void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
 +void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
 +void ff_hevc_transform_4x4_neon_8(int16_t *coeffs, int col_limit);
 +void ff_hevc_transform_8x8_neon_8(int16_t *coeffs, int col_limit);
 +void ff_hevc_idct_4x4_dc_neon_8(int16_t *coeffs);
 +void ff_hevc_idct_8x8_dc_neon_8(int16_t *coeffs);
 +void ff_hevc_idct_16x16_dc_neon_8(int16_t *coeffs);
 +void ff_hevc_idct_32x32_dc_neon_8(int16_t *coeffs);
 +void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs);
- void ff_hevc_transform_add_4x4_neon_8(uint8_t *_dst, int16_t *coeffs,
-                                       ptrdiff_t stride);
- void ff_hevc_transform_add_8x8_neon_8(uint8_t *_dst, int16_t *coeffs,
-                                       ptrdiff_t stride);
- void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
-                                       ptrdiff_t stride);
- void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
-                                       ptrdiff_t stride);
++void ff_hevc_add_residual_4x4_neon_8(uint8_t *_dst, int16_t *coeffs,
++                                     ptrdiff_t stride);
++void ff_hevc_add_residual_8x8_neon_8(uint8_t *_dst, int16_t *coeffs,
++                                     ptrdiff_t stride);
++void ff_hevc_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
++                                       ptrdiff_t stride);
++void ff_hevc_add_residual_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
++                                       ptrdiff_t stride);
 +
 +#define PUT_PIXELS(name) \
 +    void name(int16_t *dst, uint8_t *src, \
 +                                ptrdiff_t srcstride, int height, \
 +                                intptr_t mx, intptr_t my, int width)
 +PUT_PIXELS(ff_hevc_put_pixels_w2_neon_8);
 +PUT_PIXELS(ff_hevc_put_pixels_w4_neon_8);
 +PUT_PIXELS(ff_hevc_put_pixels_w6_neon_8);
 +PUT_PIXELS(ff_hevc_put_pixels_w8_neon_8);
 +PUT_PIXELS(ff_hevc_put_pixels_w12_neon_8);
 +PUT_PIXELS(ff_hevc_put_pixels_w16_neon_8);
 +PUT_PIXELS(ff_hevc_put_pixels_w24_neon_8);
 +PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8);
 +PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8);
 +PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8);
 +#undef PUT_PIXELS
 +
 +static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
 +                                   int height, int width); /* dispatch table indexed [my][mx] by ff_hevc_put_qpel_neon_wrapper; entry [0][0] is never assigned in this file */
 +static void (*put_hevc_qpel_uw_neon[4][4])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
 +                                   int width, int height, int16_t* src2, ptrdiff_t src2stride); /* same [my][mx] layout, shared by the uni and bi wrappers; entry [0][0] is never assigned in this file */
 +void ff_hevc_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
 +                                   int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
 +                                   int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
 +                                       int16_t *src2,
 +                                       int height, intptr_t mx, intptr_t my, int width);
 +#define QPEL_FUNC(name) \
 +    void name(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, \
 +                                   int height, int width)
 +
 +QPEL_FUNC(ff_hevc_put_qpel_v1_neon_8);
 +QPEL_FUNC(ff_hevc_put_qpel_v2_neon_8);
 +QPEL_FUNC(ff_hevc_put_qpel_v3_neon_8);
 +QPEL_FUNC(ff_hevc_put_qpel_h1_neon_8);
 +QPEL_FUNC(ff_hevc_put_qpel_h2_neon_8);
 +QPEL_FUNC(ff_hevc_put_qpel_h3_neon_8);
 +QPEL_FUNC(ff_hevc_put_qpel_h1v1_neon_8);
 +QPEL_FUNC(ff_hevc_put_qpel_h1v2_neon_8);
 +QPEL_FUNC(ff_hevc_put_qpel_h1v3_neon_8);
 +QPEL_FUNC(ff_hevc_put_qpel_h2v1_neon_8);
 +QPEL_FUNC(ff_hevc_put_qpel_h2v2_neon_8);
 +QPEL_FUNC(ff_hevc_put_qpel_h2v3_neon_8);
 +QPEL_FUNC(ff_hevc_put_qpel_h3v1_neon_8);
 +QPEL_FUNC(ff_hevc_put_qpel_h3v2_neon_8);
 +QPEL_FUNC(ff_hevc_put_qpel_h3v3_neon_8);
 +#undef QPEL_FUNC
 +
 +#define QPEL_FUNC_UW_PIX(name) \
 +    void name(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, \
 +                                   int height, intptr_t mx, intptr_t my, int width);
 +QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w4_neon_8);
 +QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w8_neon_8);
 +QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w16_neon_8);
 +QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w24_neon_8);
 +QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w32_neon_8);
 +QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w48_neon_8);
 +QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w64_neon_8);
 +#undef QPEL_FUNC_UW_PIX
 +
 +#define QPEL_FUNC_UW(name) \
 +    void name(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, \
 +                                   int width, int height, int16_t* src2, ptrdiff_t src2stride);
 +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_pixels_neon_8);
 +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v1_neon_8);
 +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v2_neon_8);
 +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v3_neon_8);
 +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1_neon_8);
 +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2_neon_8);
 +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3_neon_8);
 +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v1_neon_8);
 +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v2_neon_8);
 +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v3_neon_8);
 +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v1_neon_8);
 +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v2_neon_8);
 +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v3_neon_8);
 +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v1_neon_8);
 +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v2_neon_8);
 +QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v3_neon_8);
 +#undef QPEL_FUNC_UW
 +
 +void ff_hevc_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
 +                                   int height, intptr_t mx, intptr_t my, int width) { /* adapts the generic qpel signature to the per-fraction NEON kernels */
 +
 +    put_hevc_qpel_neon[my][mx](dst, MAX_PB_SIZE, src, srcstride, height, width); /* dst pitch is fixed to MAX_PB_SIZE; my/mx index a 4x4 table unchecked - assumes both are in 0..3 and not both 0 (TODO confirm against callers) */
 +}
 +
 +void ff_hevc_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
 +                                   int height, intptr_t mx, intptr_t my, int width) { /* uni-prediction entry point: forwards to the "uw" kernels without a second source */
 +
 +    put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, NULL, 0); /* note the width/height argument order of the uw kernels; src2 = NULL, src2stride = 0 */
 +}
 +
 +void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
 +                                       int16_t *src2,
 +                                       int height, intptr_t mx, intptr_t my, int width) { /* bi-prediction entry point: same "uw" kernels, with src2 as the second source */
 +    put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE); /* src2 pitch is fixed to MAX_PB_SIZE */
 +}
 +
 +av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) /* installs the NEON routines into c and fills the file-local qpel dispatch tables */
 +{
 +    if (bit_depth == 8) { /* any other bit depth leaves c untouched */
 +        int x;
 +        c->hevc_v_loop_filter_luma     = ff_hevc_v_loop_filter_luma_neon;
 +        c->hevc_h_loop_filter_luma     = ff_hevc_h_loop_filter_luma_neon;
 +        c->hevc_v_loop_filter_chroma   = ff_hevc_v_loop_filter_chroma_neon;
 +        c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_neon;
 +        c->idct[0]                     = ff_hevc_transform_4x4_neon_8; /* only the 4x4 and 8x8 full transforms are set here */
 +        c->idct[1]                     = ff_hevc_transform_8x8_neon_8;
 +        c->idct_dc[0]                  = ff_hevc_idct_4x4_dc_neon_8;
 +        c->idct_dc[1]                  = ff_hevc_idct_8x8_dc_neon_8;
 +        c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_neon_8;
 +        c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_neon_8;
-         c->transform_add[0]            = ff_hevc_transform_add_4x4_neon_8;
-         c->transform_add[1]            = ff_hevc_transform_add_8x8_neon_8;
-         c->transform_add[2]            = ff_hevc_transform_add_16x16_neon_8;
-         c->transform_add[3]            = ff_hevc_transform_add_32x32_neon_8;
-         c->idct_4x4_luma               = ff_hevc_transform_luma_4x4_neon_8;
++        c->add_residual[0]             = ff_hevc_add_residual_4x4_neon_8; /* index 0..3 = 4x4, 8x8, 16x16, 32x32 */
++        c->add_residual[1]             = ff_hevc_add_residual_8x8_neon_8;
++        c->add_residual[2]             = ff_hevc_add_residual_16x16_neon_8;
++        c->add_residual[3]             = ff_hevc_add_residual_32x32_neon_8;
++        c->transform_4x4_luma          = ff_hevc_transform_luma_4x4_neon_8;
 +        put_hevc_qpel_neon[1][0]       = ff_hevc_put_qpel_v1_neon_8; /* table layout is [vertical fraction][horizontal fraction]; [0][0] is left unset */
 +        put_hevc_qpel_neon[2][0]       = ff_hevc_put_qpel_v2_neon_8;
 +        put_hevc_qpel_neon[3][0]       = ff_hevc_put_qpel_v3_neon_8;
 +        put_hevc_qpel_neon[0][1]       = ff_hevc_put_qpel_h1_neon_8;
 +        put_hevc_qpel_neon[0][2]       = ff_hevc_put_qpel_h2_neon_8;
 +        put_hevc_qpel_neon[0][3]       = ff_hevc_put_qpel_h3_neon_8;
 +        put_hevc_qpel_neon[1][1]       = ff_hevc_put_qpel_h1v1_neon_8;
 +        put_hevc_qpel_neon[1][2]       = ff_hevc_put_qpel_h2v1_neon_8;
 +        put_hevc_qpel_neon[1][3]       = ff_hevc_put_qpel_h3v1_neon_8;
 +        put_hevc_qpel_neon[2][1]       = ff_hevc_put_qpel_h1v2_neon_8;
 +        put_hevc_qpel_neon[2][2]       = ff_hevc_put_qpel_h2v2_neon_8;
 +        put_hevc_qpel_neon[2][3]       = ff_hevc_put_qpel_h3v2_neon_8;
 +        put_hevc_qpel_neon[3][1]       = ff_hevc_put_qpel_h1v3_neon_8;
 +        put_hevc_qpel_neon[3][2]       = ff_hevc_put_qpel_h2v3_neon_8;
 +        put_hevc_qpel_neon[3][3]       = ff_hevc_put_qpel_h3v3_neon_8;
 +        put_hevc_qpel_uw_neon[1][0]      = ff_hevc_put_qpel_uw_v1_neon_8; /* same layout for the "uw" kernels used by the uni/bi wrappers */
 +        put_hevc_qpel_uw_neon[2][0]      = ff_hevc_put_qpel_uw_v2_neon_8;
 +        put_hevc_qpel_uw_neon[3][0]      = ff_hevc_put_qpel_uw_v3_neon_8;
 +        put_hevc_qpel_uw_neon[0][1]      = ff_hevc_put_qpel_uw_h1_neon_8;
 +        put_hevc_qpel_uw_neon[0][2]      = ff_hevc_put_qpel_uw_h2_neon_8;
 +        put_hevc_qpel_uw_neon[0][3]      = ff_hevc_put_qpel_uw_h3_neon_8;
 +        put_hevc_qpel_uw_neon[1][1]      = ff_hevc_put_qpel_uw_h1v1_neon_8;
 +        put_hevc_qpel_uw_neon[1][2]      = ff_hevc_put_qpel_uw_h2v1_neon_8;
 +        put_hevc_qpel_uw_neon[1][3]      = ff_hevc_put_qpel_uw_h3v1_neon_8;
 +        put_hevc_qpel_uw_neon[2][1]      = ff_hevc_put_qpel_uw_h1v2_neon_8;
 +        put_hevc_qpel_uw_neon[2][2]      = ff_hevc_put_qpel_uw_h2v2_neon_8;
 +        put_hevc_qpel_uw_neon[2][3]      = ff_hevc_put_qpel_uw_h3v2_neon_8;
 +        put_hevc_qpel_uw_neon[3][1]      = ff_hevc_put_qpel_uw_h1v3_neon_8;
 +        put_hevc_qpel_uw_neon[3][2]      = ff_hevc_put_qpel_uw_h2v3_neon_8;
 +        put_hevc_qpel_uw_neon[3][3]      = ff_hevc_put_qpel_uw_h3v3_neon_8;
 +        for (x = 0; x < 10; x++) { /* x runs over the 10 block-width slots; the last two indices are presumably (my != 0, mx != 0) - TODO confirm in hevcdsp.h */
 +            c->put_hevc_qpel[x][1][0]         = ff_hevc_put_qpel_neon_wrapper;
 +            c->put_hevc_qpel[x][0][1]         = ff_hevc_put_qpel_neon_wrapper;
 +            c->put_hevc_qpel[x][1][1]         = ff_hevc_put_qpel_neon_wrapper;
 +            c->put_hevc_qpel_uni[x][1][0]     = ff_hevc_put_qpel_uni_neon_wrapper;
 +            c->put_hevc_qpel_uni[x][0][1]     = ff_hevc_put_qpel_uni_neon_wrapper;
 +            c->put_hevc_qpel_uni[x][1][1]     = ff_hevc_put_qpel_uni_neon_wrapper;
 +            c->put_hevc_qpel_bi[x][1][0]      = ff_hevc_put_qpel_bi_neon_wrapper;
 +            c->put_hevc_qpel_bi[x][0][1]      = ff_hevc_put_qpel_bi_neon_wrapper;
 +            c->put_hevc_qpel_bi[x][1][1]      = ff_hevc_put_qpel_bi_neon_wrapper;
 +        }
 +        c->put_hevc_qpel[0][0][0]  = ff_hevc_put_pixels_w2_neon_8; /* [x][0][0] entries: width slots 0..9 = 2, 4, 6, 8, 12, 16, 24, 32, 48, 64 pixels */
 +        c->put_hevc_qpel[1][0][0]  = ff_hevc_put_pixels_w4_neon_8;
 +        c->put_hevc_qpel[2][0][0]  = ff_hevc_put_pixels_w6_neon_8;
 +        c->put_hevc_qpel[3][0][0]  = ff_hevc_put_pixels_w8_neon_8;
 +        c->put_hevc_qpel[4][0][0]  = ff_hevc_put_pixels_w12_neon_8;
 +        c->put_hevc_qpel[5][0][0]  = ff_hevc_put_pixels_w16_neon_8;
 +        c->put_hevc_qpel[6][0][0]  = ff_hevc_put_pixels_w24_neon_8;
 +        c->put_hevc_qpel[7][0][0]  = ff_hevc_put_pixels_w32_neon_8;
 +        c->put_hevc_qpel[8][0][0]  = ff_hevc_put_pixels_w48_neon_8;
 +        c->put_hevc_qpel[9][0][0]  = ff_hevc_put_pixels_w64_neon_8;
 +
 +        c->put_hevc_qpel_uni[1][0][0]  = ff_hevc_put_qpel_uw_pixels_w4_neon_8; /* slots 0, 2 and 4 (widths 2, 6, 12) are not assigned here */
 +        c->put_hevc_qpel_uni[3][0][0]  = ff_hevc_put_qpel_uw_pixels_w8_neon_8;
 +        c->put_hevc_qpel_uni[5][0][0]  = ff_hevc_put_qpel_uw_pixels_w16_neon_8;
 +        c->put_hevc_qpel_uni[6][0][0]  = ff_hevc_put_qpel_uw_pixels_w24_neon_8;
 +        c->put_hevc_qpel_uni[7][0][0]  = ff_hevc_put_qpel_uw_pixels_w32_neon_8;
 +        c->put_hevc_qpel_uni[8][0][0]  = ff_hevc_put_qpel_uw_pixels_w48_neon_8;
 +        c->put_hevc_qpel_uni[9][0][0]  = ff_hevc_put_qpel_uw_pixels_w64_neon_8;
 +    }
 +}
diff --cc libavcodec/hevc.c
index 607a8da,d5d3f59..505249e
--- a/libavcodec/hevc.c
+++ b/libavcodec/hevc.c
@@@ -1016,127 -1289,20 +1016,127 @@@ static int hls_transform_unit(HEVCConte
              }
          }
  
 +        lc->tu.cross_pf = 0;
 +
          if (cbf_luma)
 -            hls_residual_coding(s, x0, y0, log2_trafo_size, scan_idx, 0);
 -        if (log2_trafo_size > 2) {
 -            if (cbf_cb)
 -                hls_residual_coding(s, x0, y0, log2_trafo_size - 1, scan_idx_c, 1);
 -            if (cbf_cr)
 -                hls_residual_coding(s, x0, y0, log2_trafo_size - 1, scan_idx_c, 2);
 +            ff_hevc_hls_residual_coding(s, x0, y0, log2_trafo_size, scan_idx, 0);
 +        if (s->ps.sps->chroma_format_idc && (log2_trafo_size > 2 || s->ps.sps->chroma_format_idc == 3)) {
 +            int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]);
 +            int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]);
 +            lc->tu.cross_pf  = (s->ps.pps->cross_component_prediction_enabled_flag && cbf_luma &&
 +                                (lc->cu.pred_mode == MODE_INTER ||
 +                                 (lc->tu.chroma_mode_c ==  4)));
 +
 +            if (lc->tu.cross_pf) {
 +                hls_cross_component_pred(s, 0);
 +            }
 +            for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
 +                if (lc->cu.pred_mode == MODE_INTRA) {
 +                    ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
 +                    s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 1);
 +                }
 +                if (cbf_cb[i])
 +                    ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
 +                                                log2_trafo_size_c, scan_idx_c, 1);
 +                else
 +                    if (lc->tu.cross_pf) {
 +                        ptrdiff_t stride = s->frame->linesize[1];
 +                        int hshift = s->ps.sps->hshift[1];
 +                        int vshift = s->ps.sps->vshift[1];
 +                        int16_t *coeffs_y = (int16_t*)lc->edge_emu_buffer;
 +                        int16_t *coeffs   = (int16_t*)lc->edge_emu_buffer2;
 +                        int size = 1 << log2_trafo_size_c;
 +
 +                        uint8_t *dst = &s->frame->data[1][(y0 >> vshift) * stride +
 +                                                              ((x0 >> hshift) << s->ps.sps->pixel_shift)];
 +                        for (i = 0; i < (size * size); i++) {
 +                            coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
 +                        }
-                         s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
++                        s->hevcdsp.add_residual[log2_trafo_size_c-2](dst, coeffs, stride);
 +                    }
 +            }
 +
 +            if (lc->tu.cross_pf) {
 +                hls_cross_component_pred(s, 1);
 +            }
 +            for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
 +                if (lc->cu.pred_mode == MODE_INTRA) {
 +                    ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
 +                    s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 2);
 +                }
 +                if (cbf_cr[i])
 +                    ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
 +                                                log2_trafo_size_c, scan_idx_c, 2);
 +                else
 +                    if (lc->tu.cross_pf) {
 +                        ptrdiff_t stride = s->frame->linesize[2];
 +                        int hshift = s->ps.sps->hshift[2];
 +                        int vshift = s->ps.sps->vshift[2];
 +                        int16_t *coeffs_y = (int16_t*)lc->edge_emu_buffer;
 +                        int16_t *coeffs   = (int16_t*)lc->edge_emu_buffer2;
 +                        int size = 1 << log2_trafo_size_c;
 +
 +                        uint8_t *dst = &s->frame->data[2][(y0 >> vshift) * stride +
 +                                                          ((x0 >> hshift) << s->ps.sps->pixel_shift)];
 +                        for (i = 0; i < (size * size); i++) {
 +                            coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
 +                        }
-                         s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
++                        s->hevcdsp.add_residual[log2_trafo_size_c-2](dst, coeffs, stride);
 +                    }
 +            }
 +        } else if (s->ps.sps->chroma_format_idc && blk_idx == 3) {
 +            int trafo_size_h = 1 << (log2_trafo_size + 1);
 +            int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]);
 +            for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
 +                if (lc->cu.pred_mode == MODE_INTRA) {
 +                    ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
 +                                                    trafo_size_h, trafo_size_v);
 +                    s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 1);
 +                }
 +                if (cbf_cb[i])
 +                    ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
 +                                                log2_trafo_size, scan_idx_c, 1);
 +            }
 +            for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
 +                if (lc->cu.pred_mode == MODE_INTRA) {
 +                    ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
 +                                                trafo_size_h, trafo_size_v);
 +                    s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 2);
 +                }
 +                if (cbf_cr[i])
 +                    ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
 +                                                log2_trafo_size, scan_idx_c, 2);
 +            }
 +        }
 +    } else if (s->ps.sps->chroma_format_idc && lc->cu.pred_mode == MODE_INTRA) {
 +        if (log2_trafo_size > 2 || s->ps.sps->chroma_format_idc == 3) {
 +            int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]);
 +            int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]);
 +            ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v);
 +            s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 1);
 +            s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 2);
 +            if (s->ps.sps->chroma_format_idc == 2) {
 +                ff_hevc_set_neighbour_available(s, x0, y0 + (1 << log2_trafo_size_c),
 +                                                trafo_size_h, trafo_size_v);
 +                s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 1);
 +                s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 2);
 +            }
          } else if (blk_idx == 3) {
 -            if (cbf_cb)
 -                hls_residual_coding(s, xBase, yBase, log2_trafo_size, scan_idx_c, 1);
 -            if (cbf_cr)
 -                hls_residual_coding(s, xBase, yBase, log2_trafo_size, scan_idx_c, 2);
 +            int trafo_size_h = 1 << (log2_trafo_size + 1);
 +            int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]);
 +            ff_hevc_set_neighbour_available(s, xBase, yBase,
 +                                            trafo_size_h, trafo_size_v);
 +            s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 1);
 +            s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 2);
 +            if (s->ps.sps->chroma_format_idc == 2) {
 +                ff_hevc_set_neighbour_available(s, xBase, yBase + (1 << (log2_trafo_size)),
 +                                                trafo_size_h, trafo_size_v);
 +                s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 1);
 +                s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 2);
 +            }
          }
      }
 +
      return 0;
  }
  
diff --cc libavcodec/hevc_cabac.c
index 05b2821,b01808f..457e087
--- a/libavcodec/hevc_cabac.c
+++ b/libavcodec/hevc_cabac.c
@@@ -999,541 -867,6 +999,541 @@@ static av_always_inline int coeff_sign_
      int ret = 0;
  
      for (i = 0; i < nb; i++)
 -        ret = (ret << 1) | get_cabac_bypass(&s->HEVClc.cc);
 +        ret = (ret << 1) | get_cabac_bypass(&s->HEVClc->cc);
      return ret;
  }
 +
 +void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
 +                                int log2_trafo_size, enum ScanType scan_idx,
 +                                int c_idx)
 +{
 +#define GET_COORD(offset, n)                                    \
 +    do {                                                        \
 +        x_c = (x_cg << 2) + scan_x_off[n];                      \
 +        y_c = (y_cg << 2) + scan_y_off[n];                      \
 +    } while (0)
 +    HEVCLocalContext *lc = s->HEVClc;
 +    int transform_skip_flag = 0;
 +
 +    int last_significant_coeff_x, last_significant_coeff_y;
 +    int last_scan_pos;
 +    int n_end;
 +    int num_coeff = 0;
 +    int greater1_ctx = 1;
 +
 +    int num_last_subset;
 +    int x_cg_last_sig, y_cg_last_sig;
 +
 +    const uint8_t *scan_x_cg, *scan_y_cg, *scan_x_off, *scan_y_off;
 +
 +    ptrdiff_t stride = s->frame->linesize[c_idx];
 +    int hshift = s->ps.sps->hshift[c_idx];
 +    int vshift = s->ps.sps->vshift[c_idx];
 +    uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
 +                                          ((x0 >> hshift) << s->ps.sps->pixel_shift)];
 +    int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
 +    uint8_t significant_coeff_group_flag[8][8] = {{0}};
 +    int explicit_rdpcm_flag = 0;
 +    int explicit_rdpcm_dir_flag;
 +
 +    int trafo_size = 1 << log2_trafo_size;
 +    int i;
 +    int qp,shift,add,scale,scale_m;
 +    static const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 };
 +    const uint8_t *scale_matrix = NULL;
 +    uint8_t dc_scale;
 +    int pred_mode_intra = (c_idx == 0) ? lc->tu.intra_pred_mode :
 +                                         lc->tu.intra_pred_mode_c;
 +
 +    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
 +
 +    // Derive QP for dequant
 +    if (!lc->cu.cu_transquant_bypass_flag) {
 +        static const int qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
 +        static const uint8_t rem6[51 + 4 * 6 + 1] = {
 +            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
 +            3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
 +            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
 +            4, 5, 0, 1, 2, 3, 4, 5, 0, 1
 +        };
 +
 +        static const uint8_t div6[51 + 4 * 6 + 1] = {
 +            0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3,  3,  3,
 +            3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6,  6,  6,
 +            7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10,
 +            10, 10, 11, 11, 11, 11, 11, 11, 12, 12
 +        };
 +        int qp_y = lc->qp_y;
 +
 +        if (s->ps.pps->transform_skip_enabled_flag &&
 +            log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) {
 +            transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx);
 +        }
 +
 +        if (c_idx == 0) {
 +            qp = qp_y + s->ps.sps->qp_bd_offset;
 +        } else {
 +            int qp_i, offset;
 +
 +            if (c_idx == 1)
 +                offset = s->ps.pps->cb_qp_offset + s->sh.slice_cb_qp_offset +
 +                         lc->tu.cu_qp_offset_cb;
 +            else
 +                offset = s->ps.pps->cr_qp_offset + s->sh.slice_cr_qp_offset +
 +                         lc->tu.cu_qp_offset_cr;
 +
 +            qp_i = av_clip(qp_y + offset, - s->ps.sps->qp_bd_offset, 57);
 +            if (s->ps.sps->chroma_format_idc == 1) {
 +                if (qp_i < 30)
 +                    qp = qp_i;
 +                else if (qp_i > 43)
 +                    qp = qp_i - 6;
 +                else
 +                    qp = qp_c[qp_i - 30];
 +            } else {
 +                if (qp_i > 51)
 +                    qp = 51;
 +                else
 +                    qp = qp_i;
 +            }
 +
 +            qp += s->ps.sps->qp_bd_offset;
 +        }
 +
 +        shift    = s->ps.sps->bit_depth + log2_trafo_size - 5;
 +        add      = 1 << (shift-1);
 +        scale    = level_scale[rem6[qp]] << (div6[qp]);
 +        scale_m  = 16; // default when no custom scaling lists.
 +        dc_scale = 16;
 +
 +        if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
 +            const ScalingList *sl = s->ps.pps->scaling_list_data_present_flag ?
 +            &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
 +            int matrix_id = lc->cu.pred_mode != MODE_INTRA;
 +
 +            matrix_id = 3 * matrix_id + c_idx;
 +
 +            scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id];
 +            if (log2_trafo_size >= 4)
 +                dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id];
 +        }
 +    } else {
 +        shift        = 0;
 +        add          = 0;
 +        scale        = 0;
 +        dc_scale     = 0;
 +    }
 +
 +    if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag &&
 +        (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
 +        explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx);
 +        if (explicit_rdpcm_flag) {
 +            explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx);
 +        }
 +    }
 +
 +    last_significant_coeff_xy_prefix_decode(s, c_idx, log2_trafo_size,
 +                                           &last_significant_coeff_x, &last_significant_coeff_y);
 +
 +    if (last_significant_coeff_x > 3) {
 +        int suffix = last_significant_coeff_suffix_decode(s, last_significant_coeff_x);
 +        last_significant_coeff_x = (1 << ((last_significant_coeff_x >> 1) - 1)) *
 +        (2 + (last_significant_coeff_x & 1)) +
 +        suffix;
 +    }
 +
 +    if (last_significant_coeff_y > 3) {
 +        int suffix = last_significant_coeff_suffix_decode(s, last_significant_coeff_y);
 +        last_significant_coeff_y = (1 << ((last_significant_coeff_y >> 1) - 1)) *
 +        (2 + (last_significant_coeff_y & 1)) +
 +        suffix;
 +    }
 +
 +    if (scan_idx == SCAN_VERT)
 +        FFSWAP(int, last_significant_coeff_x, last_significant_coeff_y);
 +
 +    x_cg_last_sig = last_significant_coeff_x >> 2;
 +    y_cg_last_sig = last_significant_coeff_y >> 2;
 +
 +    switch (scan_idx) {
 +    case SCAN_DIAG: {
 +        int last_x_c = last_significant_coeff_x & 3;
 +        int last_y_c = last_significant_coeff_y & 3;
 +
 +        scan_x_off = ff_hevc_diag_scan4x4_x;
 +        scan_y_off = ff_hevc_diag_scan4x4_y;
 +        num_coeff = diag_scan4x4_inv[last_y_c][last_x_c];
 +        if (trafo_size == 4) {
 +            scan_x_cg = scan_1x1;
 +            scan_y_cg = scan_1x1;
 +        } else if (trafo_size == 8) {
 +            num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4;
 +            scan_x_cg = diag_scan2x2_x;
 +            scan_y_cg = diag_scan2x2_y;
 +        } else if (trafo_size == 16) {
 +            num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4;
 +            scan_x_cg = ff_hevc_diag_scan4x4_x;
 +            scan_y_cg = ff_hevc_diag_scan4x4_y;
 +        } else { // trafo_size == 32
 +            num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4;
 +            scan_x_cg = ff_hevc_diag_scan8x8_x;
 +            scan_y_cg = ff_hevc_diag_scan8x8_y;
 +        }
 +        break;
 +    }
 +    case SCAN_HORIZ:
 +        scan_x_cg = horiz_scan2x2_x;
 +        scan_y_cg = horiz_scan2x2_y;
 +        scan_x_off = horiz_scan4x4_x;
 +        scan_y_off = horiz_scan4x4_y;
 +        num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x];
 +        break;
 +    default: //SCAN_VERT
 +        scan_x_cg = horiz_scan2x2_y;
 +        scan_y_cg = horiz_scan2x2_x;
 +        scan_x_off = horiz_scan4x4_y;
 +        scan_y_off = horiz_scan4x4_x;
 +        num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y];
 +        break;
 +    }
 +    num_coeff++;
 +    num_last_subset = (num_coeff - 1) >> 4;
 +
 +    for (i = num_last_subset; i >= 0; i--) {
 +        int n, m;
 +        int x_cg, y_cg, x_c, y_c, pos;
 +        int implicit_non_zero_coeff = 0;
 +        int64_t trans_coeff_level;
 +        int prev_sig = 0;
 +        int offset = i << 4;
 +        int rice_init = 0;
 +
 +        uint8_t significant_coeff_flag_idx[16];
 +        uint8_t nb_significant_coeff_flag = 0;
 +
 +        x_cg = scan_x_cg[i];
 +        y_cg = scan_y_cg[i];
 +
 +        if ((i < num_last_subset) && (i > 0)) {
 +            int ctx_cg = 0;
 +            if (x_cg < (1 << (log2_trafo_size - 2)) - 1)
 +                ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg];
 +            if (y_cg < (1 << (log2_trafo_size - 2)) - 1)
 +                ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1];
 +
 +            significant_coeff_group_flag[x_cg][y_cg] =
 +                significant_coeff_group_flag_decode(s, c_idx, ctx_cg);
 +            implicit_non_zero_coeff = 1;
 +        } else {
 +            significant_coeff_group_flag[x_cg][y_cg] =
 +            ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) ||
 +             (x_cg == 0 && y_cg == 0));
 +        }
 +
 +        last_scan_pos = num_coeff - offset - 1;
 +
 +        if (i == num_last_subset) {
 +            n_end = last_scan_pos - 1;
 +            significant_coeff_flag_idx[0] = last_scan_pos;
 +            nb_significant_coeff_flag = 1;
 +        } else {
 +            n_end = 15;
 +        }
 +
 +        if (x_cg < ((1 << log2_trafo_size) - 1) >> 2)
 +            prev_sig = !!significant_coeff_group_flag[x_cg + 1][y_cg];
 +        if (y_cg < ((1 << log2_trafo_size) - 1) >> 2)
 +            prev_sig += (!!significant_coeff_group_flag[x_cg][y_cg + 1] << 1);
 +
 +        if (significant_coeff_group_flag[x_cg][y_cg] && n_end >= 0) {
 +            static const uint8_t ctx_idx_map[] = {
 +                0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8, // log2_trafo_size == 2
 +                1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 0
 +                2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 1
 +                2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, // prev_sig == 2
 +                2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2  // default
 +            };
 +            const uint8_t *ctx_idx_map_p;
 +            int scf_offset = 0;
 +            if (s->ps.sps->transform_skip_context_enabled_flag &&
 +                (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
 +                ctx_idx_map_p = (uint8_t*) &ctx_idx_map[4 * 16];
 +                if (c_idx == 0) {
 +                    scf_offset = 40;
 +                } else {
 +                    scf_offset = 14 + 27;
 +                }
 +            } else {
 +                if (c_idx != 0)
 +                    scf_offset = 27;
 +                if (log2_trafo_size == 2) {
 +                    ctx_idx_map_p = (uint8_t*) &ctx_idx_map[0];
 +                } else {
 +                    ctx_idx_map_p = (uint8_t*) &ctx_idx_map[(prev_sig + 1) << 4];
 +                    if (c_idx == 0) {
 +                        if ((x_cg > 0 || y_cg > 0))
 +                            scf_offset += 3;
 +                        if (log2_trafo_size == 3) {
 +                            scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15;
 +                        } else {
 +                            scf_offset += 21;
 +                        }
 +                    } else {
 +                        if (log2_trafo_size == 3)
 +                            scf_offset += 9;
 +                        else
 +                            scf_offset += 12;
 +                    }
 +                }
 +            }
 +            for (n = n_end; n > 0; n--) {
 +                x_c = scan_x_off[n];
 +                y_c = scan_y_off[n];
 +                if (significant_coeff_flag_decode(s, x_c, y_c, scf_offset, ctx_idx_map_p)) {
 +                    significant_coeff_flag_idx[nb_significant_coeff_flag] = n;
 +                    nb_significant_coeff_flag++;
 +                    implicit_non_zero_coeff = 0;
 +                }
 +            }
 +            if (implicit_non_zero_coeff == 0) {
 +                if (s->ps.sps->transform_skip_context_enabled_flag &&
 +                    (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
 +                    if (c_idx == 0) {
 +                        scf_offset = 42;
 +                    } else {
 +                        scf_offset = 16 + 27;
 +                    }
 +                } else {
 +                    if (i == 0) {
 +                        if (c_idx == 0)
 +                            scf_offset = 0;
 +                        else
 +                            scf_offset = 27;
 +                    } else {
 +                        scf_offset = 2 + scf_offset;
 +                    }
 +                }
 +                if (significant_coeff_flag_decode_0(s, c_idx, scf_offset) == 1) {
 +                    significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
 +                    nb_significant_coeff_flag++;
 +                }
 +            } else {
 +                significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
 +                nb_significant_coeff_flag++;
 +            }
 +        }
 +
 +        n_end = nb_significant_coeff_flag;
 +
 +
 +        if (n_end) {
 +            int first_nz_pos_in_cg;
 +            int last_nz_pos_in_cg;
 +            int c_rice_param = 0;
 +            int first_greater1_coeff_idx = -1;
 +            uint8_t coeff_abs_level_greater1_flag[8];
 +            uint16_t coeff_sign_flag;
 +            int sum_abs = 0;
 +            int sign_hidden;
 +            int sb_type;
 +
 +
 +            // initialize first elem of coeff_bas_level_greater1_flag
 +            int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0;
 +
 +            if (s->ps.sps->persistent_rice_adaptation_enabled_flag) {
 +                if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag)
 +                    sb_type = 2 * (c_idx == 0 ? 1 : 0);
 +                else
 +                    sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1;
 +                c_rice_param = lc->stat_coeff[sb_type] / 4;
 +            }
 +
 +            if (!(i == num_last_subset) && greater1_ctx == 0)
 +                ctx_set++;
 +            greater1_ctx = 1;
 +            last_nz_pos_in_cg = significant_coeff_flag_idx[0];
 +
 +            for (m = 0; m < (n_end > 8 ? 8 : n_end); m++) {
 +                int inc = (ctx_set << 2) + greater1_ctx;
 +                coeff_abs_level_greater1_flag[m] =
 +                    coeff_abs_level_greater1_flag_decode(s, c_idx, inc);
 +                if (coeff_abs_level_greater1_flag[m]) {
 +                    greater1_ctx = 0;
 +                    if (first_greater1_coeff_idx == -1)
 +                        first_greater1_coeff_idx = m;
 +                } else if (greater1_ctx > 0 && greater1_ctx < 3) {
 +                    greater1_ctx++;
 +                }
 +            }
 +            first_nz_pos_in_cg = significant_coeff_flag_idx[n_end - 1];
 +
 +            if (lc->cu.cu_transquant_bypass_flag ||
 +                (lc->cu.pred_mode ==  MODE_INTRA  &&
 +                 s->ps.sps->implicit_rdpcm_enabled_flag  &&  transform_skip_flag  &&
 +                 (pred_mode_intra == 10 || pred_mode_intra  ==  26 )) ||
 +                 explicit_rdpcm_flag)
 +                sign_hidden = 0;
 +            else
 +                sign_hidden = (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4);
 +
 +            if (first_greater1_coeff_idx != -1) {
 +                coeff_abs_level_greater1_flag[first_greater1_coeff_idx] += coeff_abs_level_greater2_flag_decode(s, c_idx, ctx_set);
 +            }
 +            if (!s->ps.pps->sign_data_hiding_flag || !sign_hidden ) {
 +                coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag) << (16 - nb_significant_coeff_flag);
 +            } else {
 +                coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag - 1) << (16 - (nb_significant_coeff_flag - 1));
 +            }
 +
 +            for (m = 0; m < n_end; m++) {
 +                n = significant_coeff_flag_idx[m];
 +                GET_COORD(offset, n);
 +                if (m < 8) {
 +                    trans_coeff_level = 1 + coeff_abs_level_greater1_flag[m];
 +                    if (trans_coeff_level == ((m == first_greater1_coeff_idx) ? 3 : 2)) {
 +                        int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
 +
 +                        trans_coeff_level += last_coeff_abs_level_remaining;
 +                        if (trans_coeff_level > (3 << c_rice_param))
 +                            c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
 +                        if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
 +                            int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
 +                            if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
 +                                lc->stat_coeff[sb_type]++;
 +                            else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
 +                                if (lc->stat_coeff[sb_type] > 0)
 +                                    lc->stat_coeff[sb_type]--;
 +                            rice_init = 1;
 +                        }
 +                    }
 +                } else {
 +                    int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
 +
 +                    trans_coeff_level = 1 + last_coeff_abs_level_remaining;
 +                    if (trans_coeff_level > (3 << c_rice_param))
 +                        c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
 +                    if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
 +                        int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
 +                        if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
 +                            lc->stat_coeff[sb_type]++;
 +                        else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
 +                            if (lc->stat_coeff[sb_type] > 0)
 +                                lc->stat_coeff[sb_type]--;
 +                        rice_init = 1;
 +                    }
 +                }
 +                if (s->ps.pps->sign_data_hiding_flag && sign_hidden) {
 +                    sum_abs += trans_coeff_level;
 +                    if (n == first_nz_pos_in_cg && (sum_abs&1))
 +                        trans_coeff_level = -trans_coeff_level;
 +                }
 +                if (coeff_sign_flag >> 15)
 +                    trans_coeff_level = -trans_coeff_level;
 +                coeff_sign_flag <<= 1;
 +                if(!lc->cu.cu_transquant_bypass_flag) {
 +                    if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
 +                        if(y_c || x_c || log2_trafo_size < 4) {
 +                            switch(log2_trafo_size) {
 +                                case 3: pos = (y_c << 3) + x_c; break;
 +                                case 4: pos = ((y_c >> 1) << 3) + (x_c >> 1); break;
 +                                case 5: pos = ((y_c >> 2) << 3) + (x_c >> 2); break;
 +                                default: pos = (y_c << 2) + x_c; break;
 +                            }
 +                            scale_m = scale_matrix[pos];
 +                        } else {
 +                            scale_m = dc_scale;
 +                        }
 +                    }
 +                    trans_coeff_level = (trans_coeff_level * (int64_t)scale * (int64_t)scale_m + add) >> shift;
 +                    if(trans_coeff_level < 0) {
 +                        if((~trans_coeff_level) & 0xFffffffffff8000)
 +                            trans_coeff_level = -32768;
 +                    } else {
 +                        if(trans_coeff_level & 0xffffffffffff8000)
 +                            trans_coeff_level = 32767;
 +                    }
 +                }
 +                coeffs[y_c * trafo_size + x_c] = trans_coeff_level;
 +            }
 +        }
 +    }
 +
 +    if (lc->cu.cu_transquant_bypass_flag) {
 +        if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
 +                                    (pred_mode_intra == 10 || pred_mode_intra == 26))) {
 +            int mode = s->ps.sps->implicit_rdpcm_enabled_flag ? (pred_mode_intra == 26) : explicit_rdpcm_dir_flag;
 +
 +            s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
 +        }
 +    } else {
 +        if (transform_skip_flag) {
 +            int rot = s->ps.sps->transform_skip_rotation_enabled_flag &&
 +                      log2_trafo_size == 2 &&
 +                      lc->cu.pred_mode == MODE_INTRA;
 +            if (rot) {
 +                for (i = 0; i < 8; i++)
 +                    FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]);
 +            }
 +
-             s->hevcdsp.transform_skip(coeffs, log2_trafo_size);
++            s->hevcdsp.dequant(coeffs, log2_trafo_size);
 +
 +            if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
 +                                        lc->cu.pred_mode == MODE_INTRA &&
 +                                        (pred_mode_intra == 10 || pred_mode_intra == 26))) {
 +                int mode = explicit_rdpcm_flag ? explicit_rdpcm_dir_flag : (pred_mode_intra == 26);
 +
 +                s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
 +            }
 +        } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
-             s->hevcdsp.idct_4x4_luma(coeffs);
++            s->hevcdsp.transform_4x4_luma(coeffs);
 +        } else {
 +            int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
 +            if (max_xy == 0)
 +                s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
 +            else {
 +                int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
 +                if (max_xy < 4)
 +                    col_limit = FFMIN(4, col_limit);
 +                else if (max_xy < 8)
 +                    col_limit = FFMIN(8, col_limit);
 +                else if (max_xy < 12)
 +                    col_limit = FFMIN(24, col_limit);
 +                s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
 +            }
 +        }
 +    }
 +    if (lc->tu.cross_pf) {
 +        int16_t *coeffs_y = (int16_t*)lc->edge_emu_buffer;
 +
 +        for (i = 0; i < (trafo_size * trafo_size); i++) {
 +            coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
 +        }
 +    }
-     s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride);
++    s->hevcdsp.add_residual[log2_trafo_size-2](dst, coeffs, stride);
 +}
 +
 +void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size)
 +{
 +    HEVCLocalContext *lc = s->HEVClc;
 +    int x = abs_mvd_greater0_flag_decode(s);
 +    int y = abs_mvd_greater0_flag_decode(s);
 +
 +    if (x)
 +        x += abs_mvd_greater1_flag_decode(s);
 +    if (y)
 +        y += abs_mvd_greater1_flag_decode(s);
 +
 +    switch (x) {
 +    case 2: lc->pu.mvd.x = mvd_decode(s);           break;
 +    case 1: lc->pu.mvd.x = mvd_sign_flag_decode(s); break;
 +    case 0: lc->pu.mvd.x = 0;                       break;
 +    }
 +
 +    switch (y) {
 +    case 2: lc->pu.mvd.y = mvd_decode(s);           break;
 +    case 1: lc->pu.mvd.y = mvd_sign_flag_decode(s); break;
 +    case 0: lc->pu.mvd.y = 0;                       break;
 +    }
 +}
 +
diff --cc libavcodec/hevcdsp.c
index 9d773d9,4bd3d97..23e923f
--- a/libavcodec/hevcdsp.c
+++ b/libavcodec/hevcdsp.c
@@@ -195,13 -164,12 +195,13 @@@ void ff_hevc_dsp_init(HEVCDSPContext *h
  
  #define HEVC_DSP(depth)                                                     \
      hevcdsp->put_pcm                = FUNC(put_pcm, depth);                 \
-     hevcdsp->transform_add[0]       = FUNC(transform_add4x4, depth);        \
-     hevcdsp->transform_add[1]       = FUNC(transform_add8x8, depth);        \
-     hevcdsp->transform_add[2]       = FUNC(transform_add16x16, depth);      \
-     hevcdsp->transform_add[3]       = FUNC(transform_add32x32, depth);      \
-     hevcdsp->transform_skip         = FUNC(transform_skip, depth);          \
+     hevcdsp->add_residual[0]        = FUNC(add_residual4x4, depth);         \
+     hevcdsp->add_residual[1]        = FUNC(add_residual8x8, depth);         \
+     hevcdsp->add_residual[2]        = FUNC(add_residual16x16, depth);       \
+     hevcdsp->add_residual[3]        = FUNC(add_residual32x32, depth);       \
+     hevcdsp->dequant                = FUNC(dequant, depth);                 \
 +    hevcdsp->transform_rdpcm        = FUNC(transform_rdpcm, depth);         \
-     hevcdsp->idct_4x4_luma          = FUNC(transform_4x4_luma, depth);      \
+     hevcdsp->transform_4x4_luma     = FUNC(transform_4x4_luma, depth);      \
      hevcdsp->idct[0]                = FUNC(idct_4x4, depth);                \
      hevcdsp->idct[1]                = FUNC(idct_8x8, depth);                \
      hevcdsp->idct[2]                = FUNC(idct_16x16, depth);              \
diff --cc libavcodec/hevcdsp.h
index 9f1f6dd,decd1c9..3b7e737
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@@ -43,82 -39,76 +43,82 @@@ typedef struct SAOParams 
  } SAOParams;
  
  typedef struct HEVCDSPContext {
 -    void (*put_pcm)(uint8_t *dst, ptrdiff_t stride, int size,
 -                    GetBitContext *gb, int pcm_bit_depth);
 +    void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
 +                    struct GetBitContext *gb, int pcm_bit_depth);
 +
-     void (*transform_add[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride);
++    void (*add_residual[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride);
 +
-     void (*transform_skip)(int16_t *coeffs, int16_t log2_size);
++    void (*dequant)(int16_t *coeffs, int16_t log2_size);
  
 -    void (*add_residual[4])(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +    void (*transform_rdpcm)(int16_t *coeffs, int16_t log2_size, int mode);
  
-     void (*idct_4x4_luma)(int16_t *coeffs);
 -    void (*dequant)(int16_t *coeffs);
+     void (*transform_4x4_luma)(int16_t *coeffs);
 -    void (*idct[4])(int16_t *coeffs);
 -
 -    void (*sao_band_filter[4])(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
 -                               struct SAOParams *sao, int *borders,
 -                               int width, int height, int c_idx);
 -    void (*sao_edge_filter[4])(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
 -                               struct SAOParams *sao, int *borders, int width,
 -                               int height, int c_idx, uint8_t vert_edge,
 -                               uint8_t horiz_edge, uint8_t diag_edge);
 -
 -    void (*put_hevc_qpel[2][2][8])(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
 -                                   ptrdiff_t srcstride, int height,
 -                                   int mx, int my, int16_t *mcbuffer);
 -    void (*put_hevc_epel[2][2][8])(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
 -                                   ptrdiff_t srcstride, int height,
 -                                   int mx, int my, int16_t *mcbuffer);
 -
 -    void (*put_unweighted_pred[8])(uint8_t *dst, ptrdiff_t dststride, int16_t *src,
 -                                   ptrdiff_t srcstride, int height);
 -    void (*put_unweighted_pred_chroma[8])(uint8_t *dst, ptrdiff_t dststride, int16_t *src,
 -                                          ptrdiff_t srcstride, int height);
 -    void (*put_unweighted_pred_avg[8])(uint8_t *dst, ptrdiff_t dststride,
 -                                       int16_t *src1, int16_t *src2,
 -                                       ptrdiff_t srcstride, int height);
 -    void (*put_unweighted_pred_avg_chroma[8])(uint8_t *dst, ptrdiff_t dststride,
 -                                              int16_t *src1, int16_t *src2,
 -                                              ptrdiff_t srcstride, int height);
 -    void (*weighted_pred[8])(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
 -                             uint8_t *dst, ptrdiff_t dststride, int16_t *src,
 -                             ptrdiff_t srcstride, int height);
 -    void (*weighted_pred_chroma[8])(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
 -                                    uint8_t *dst, ptrdiff_t dststride, int16_t *src,
 -                                    ptrdiff_t srcstride, int height);
 -    void (*weighted_pred_avg[8])(uint8_t denom, int16_t wl0Flag, int16_t wl1Flag,
 -                                 int16_t ol0Flag, int16_t ol1Flag, uint8_t *dst,
 -                                 ptrdiff_t dststride, int16_t *src1, int16_t *src2,
 -                                 ptrdiff_t srcstride, int height);
 -    void (*weighted_pred_avg_chroma[8])(uint8_t denom, int16_t wl0Flag, int16_t wl1Flag,
 -                                        int16_t ol0Flag, int16_t ol1Flag, uint8_t *dst,
 -                                        ptrdiff_t dststride, int16_t *src1, int16_t *src2,
 -                                        ptrdiff_t srcstride, int height);
 +
 +    void (*idct[4])(int16_t *coeffs, int col_limit);
 +
 +    void (*idct_dc[4])(int16_t *coeffs);
 +
 +    void (*sao_band_filter[5])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
 +                               int16_t *sao_offset_val, int sao_left_class, int width, int height);
 +
 +    /* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE */
 +    void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
 +                               int16_t *sao_offset_val, int sao_eo_class, int width, int height);
 +
 +    void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
 +                                struct SAOParams *sao, int *borders, int _width, int _height, int c_idx,
 +                                uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
 +
 +    void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
 +                                    int height, intptr_t mx, intptr_t my, int width);
 +    void (*put_hevc_qpel_uni[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
 +                                        int height, intptr_t mx, intptr_t my, int width);
 +    void (*put_hevc_qpel_uni_w[10][2][2])(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 +                                          int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width);
 +
 +    void (*put_hevc_qpel_bi[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
 +                                       int16_t *src2,
 +                                       int height, intptr_t mx, intptr_t my, int width);
 +    void (*put_hevc_qpel_bi_w[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
 +                                         int16_t *src2,
 +                                         int height, int denom, int wx0, int wx1,
 +                                         int ox0, int ox1, intptr_t mx, intptr_t my, int width);
 +    void (*put_hevc_epel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
 +                                    int height, intptr_t mx, intptr_t my, int width);
 +
 +    void (*put_hevc_epel_uni[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
 +                                        int height, intptr_t mx, intptr_t my, int width);
 +    void (*put_hevc_epel_uni_w[10][2][2])(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 +                                          int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width);
 +    void (*put_hevc_epel_bi[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
 +                                       int16_t *src2,
 +                                       int height, intptr_t mx, intptr_t my, int width);
 +    void (*put_hevc_epel_bi_w[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
 +                                         int16_t *src2,
 +                                         int height, int denom, int wx0, int ox0, int wx1,
 +                                         int ox1, intptr_t mx, intptr_t my, int width);
  
      void (*hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
 -                                    int beta, int *tc,
 +                                    int beta, int32_t *tc,
                                      uint8_t *no_p, uint8_t *no_q);
      void (*hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
 -                                    int beta, int *tc,
 +                                    int beta, int32_t *tc,
                                      uint8_t *no_p, uint8_t *no_q);
      void (*hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
 -                                      int *tc, uint8_t *no_p, uint8_t *no_q);
 +                                      int32_t *tc, uint8_t *no_p, uint8_t *no_q);
      void (*hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
 -                                      int *tc, uint8_t *no_p, uint8_t *no_q);
 +                                      int32_t *tc, uint8_t *no_p, uint8_t *no_q);
      void (*hevc_h_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride,
 -                                      int beta, int *tc,
 +                                      int beta, int32_t *tc,
                                        uint8_t *no_p, uint8_t *no_q);
      void (*hevc_v_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride,
 -                                      int beta, int *tc,
 +                                      int beta, int32_t *tc,
                                        uint8_t *no_p, uint8_t *no_q);
      void (*hevc_h_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
 -                                        int *tc, uint8_t *no_p,
 +                                        int32_t *tc, uint8_t *no_p,
                                          uint8_t *no_q);
      void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
 -                                        int *tc, uint8_t *no_p,
 +                                        int32_t *tc, uint8_t *no_p,
                                          uint8_t *no_q);
  } HEVCDSPContext;
  
diff --cc libavcodec/hevcdsp_template.c
index 78eb354,b4816db..66b1ac0
--- a/libavcodec/hevcdsp_template.c
+++ b/libavcodec/hevcdsp_template.c
@@@ -59,76 -57,43 +59,74 @@@ static av_always_inline void FUNC(add_r
      }
  }
  
- static void FUNC(transform_add4x4)(uint8_t *_dst, int16_t *coeffs,
-                                        ptrdiff_t stride)
+ static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *res,
+                                   ptrdiff_t stride)
  {
-     FUNC(transquant_bypass)(_dst, coeffs, stride, 4);
+     FUNC(add_residual)(_dst, res, stride, 4);
  }
  
- static void FUNC(transform_add8x8)(uint8_t *_dst, int16_t *coeffs,
-                                        ptrdiff_t stride)
+ static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *res,
+                                   ptrdiff_t stride)
  {
-     FUNC(transquant_bypass)(_dst, coeffs, stride, 8);
+     FUNC(add_residual)(_dst, res, stride, 8);
  }
  
- static void FUNC(transform_add16x16)(uint8_t *_dst, int16_t *coeffs,
-                                          ptrdiff_t stride)
+ static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *res,
+                                     ptrdiff_t stride)
  {
-     FUNC(transquant_bypass)(_dst, coeffs, stride, 16);
+     FUNC(add_residual)(_dst, res, stride, 16);
  }
  
- static void FUNC(transform_add32x32)(uint8_t *_dst, int16_t *coeffs,
-                                          ptrdiff_t stride)
+ static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res,
+                                     ptrdiff_t stride)
  {
-     FUNC(transquant_bypass)(_dst, coeffs, stride, 32);
+     FUNC(add_residual)(_dst, res, stride, 32);
  }
  
 -static void FUNC(dequant)(int16_t *coeffs)
 +
 +static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode)
  {
 -    int shift  = 13 - BIT_DEPTH;
 -#if BIT_DEPTH <= 13
 -    int offset = 1 << (shift - 1);
 -#else
 -    int offset = 0;
 -#endif
 +    int16_t *coeffs = (int16_t *) _coeffs;
      int x, y;
 +    int size = 1 << log2_size;
 +
 +    if (mode) {
 +        coeffs += size;
 +        for (y = 0; y < size - 1; y++) {
 +            for (x = 0; x < size; x++)
 +                coeffs[x] += coeffs[x - size];
 +            coeffs += size;
 +        }
 +    } else {
 +        for (y = 0; y < size; y++) {
 +            for (x = 1; x < size; x++)
 +                coeffs[x] += coeffs[x - 1];
 +            coeffs += size;
 +        }
 +    }
 +}
  
- static void FUNC(transform_skip)(int16_t *_coeffs, int16_t log2_size)
 -    for (y = 0; y < 4 * 4; y += 4) {
 -        for (x = 0; x < 4; x++)
 -            coeffs[y + x] = (coeffs[y + x] + offset) >> shift;
++static void FUNC(dequant)(int16_t *coeffs, int16_t log2_size)
 +{
 +    int shift  = 15 - BIT_DEPTH - log2_size;
 +    int x, y;
 +    int size = 1 << log2_size;
-     int16_t *coeffs = _coeffs;
- 
 +
 +    if (shift > 0) {
 +        int offset = 1 << (shift - 1);
 +        for (y = 0; y < size; y++) {
 +            for (x = 0; x < size; x++) {
 +                *coeffs = (*coeffs + offset) >> shift;
 +                coeffs++;
 +            }
 +        }
 +    } else {
 +        for (y = 0; y < size; y++) {
 +            for (x = 0; x < size; x++) {
 +                *coeffs = *coeffs << -shift;
 +                coeffs++;
 +            }
 +        }
      }
  }
  
@@@ -174,137 -137,157 +170,136 @@@ static void FUNC(transform_4x4_luma)(in
  
  #undef TR_4x4_LUMA
  
 -#define TR_4(dst, src, dstep, sstep, assign)                            \
 -    do {                                                                \
 -        const int e0 = transform[8 * 0][0] * src[0 * sstep] +           \
 -                       transform[8 * 2][0] * src[2 * sstep];            \
 -        const int e1 = transform[8 * 0][1] * src[0 * sstep] +           \
 -                       transform[8 * 2][1] * src[2 * sstep];            \
 -        const int o0 = transform[8 * 1][0] * src[1 * sstep] +           \
 -                       transform[8 * 3][0] * src[3 * sstep];            \
 -        const int o1 = transform[8 * 1][1] * src[1 * sstep] +           \
 -                       transform[8 * 3][1] * src[3 * sstep];            \
 -                                                                        \
 -        assign(dst[0 * dstep], e0 + o0);                                \
 -        assign(dst[1 * dstep], e1 + o1);                                \
 -        assign(dst[2 * dstep], e1 - o1);                                \
 -        assign(dst[3 * dstep], e0 - o0);                                \
 +#define TR_4(dst, src, dstep, sstep, assign, end)                              \
 +    do {                                                                       \
 +        const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep];              \
 +        const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep];              \
 +        const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep];              \
 +        const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep];              \
 +                                                                               \
 +        assign(dst[0 * dstep], e0 + o0);                                       \
 +        assign(dst[1 * dstep], e1 + o1);                                       \
 +        assign(dst[2 * dstep], e1 - o1);                                       \
 +        assign(dst[3 * dstep], e0 - o0);                                       \
      } while (0)
  
 -static void FUNC(idct_4x4)(int16_t *coeffs)
 -{
 -    int i;
 -    int shift    = 7;
 -    int add      = 1 << (shift - 1);
 -    int16_t *src = coeffs;
 -
 -    for (i = 0; i < 4; i++) {
 -        TR_4(src, src, 4, 4, SCALE);
 -        src++;
 -    }
 -
 -    shift = 20 - BIT_DEPTH;
 -    add   = 1 << (shift - 1);
 -    for (i = 0; i < 4; i++) {
 -        TR_4(coeffs, coeffs, 1, 1, SCALE);
 -        coeffs += 4;
 -    }
 -}
 -
 -#define TR_8(dst, src, dstep, sstep, assign)                      \
 -    do {                                                          \
 -        int i, j;                                                 \
 -        int e_8[4];                                               \
 -        int o_8[4] = { 0 };                                       \
 -        for (i = 0; i < 4; i++)                                   \
 -            for (j = 1; j < 8; j += 2)                            \
 -                o_8[i] += transform[4 * j][i] * src[j * sstep];   \
 -        TR_4(e_8, src, 1, 2 * sstep, SET);                        \
 -                                                                  \
 -        for (i = 0; i < 4; i++) {                                 \
 -            assign(dst[i * dstep], e_8[i] + o_8[i]);              \
 -            assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]);        \
 -        }                                                         \
 +#define TR_8(dst, src, dstep, sstep, assign, end)                              \
 +    do {                                                                       \
 +        int i, j;                                                              \
 +        int e_8[4];                                                            \
 +        int o_8[4] = { 0 };                                                    \
 +        for (i = 0; i < 4; i++)                                                \
 +            for (j = 1; j < end; j += 2)                                       \
 +                o_8[i] += transform[4 * j][i] * src[j * sstep];                \
 +        TR_4(e_8, src, 1, 2 * sstep, SET, 4);                                  \
 +                                                                               \
 +        for (i = 0; i < 4; i++) {                                              \
 +            assign(dst[i * dstep], e_8[i] + o_8[i]);                           \
 +            assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]);                     \
 +        }                                                                      \
      } while (0)
  
 -#define TR_16(dst, src, dstep, sstep, assign)                     \
 -    do {                                                          \
 -        int i, j;                                                 \
 -        int e_16[8];                                              \
 -        int o_16[8] = { 0 };                                      \
 -        for (i = 0; i < 8; i++)                                   \
 -            for (j = 1; j < 16; j += 2)                           \
 -                o_16[i] += transform[2 * j][i] * src[j * sstep];  \
 -        TR_8(e_16, src, 1, 2 * sstep, SET);                       \
 -                                                                  \
 -        for (i = 0; i < 8; i++) {                                 \
 -            assign(dst[i * dstep], e_16[i] + o_16[i]);            \
 -            assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]);     \
 -        }                                                         \
 +#define TR_16(dst, src, dstep, sstep, assign, end)                             \
 +    do {                                                                       \
 +        int i, j;                                                              \
 +        int e_16[8];                                                           \
 +        int o_16[8] = { 0 };                                                   \
 +        for (i = 0; i < 8; i++)                                                \
 +            for (j = 1; j < end; j += 2)                                       \
 +                o_16[i] += transform[2 * j][i] * src[j * sstep];               \
 +        TR_8(e_16, src, 1, 2 * sstep, SET, 8);                                 \
 +                                                                               \
 +        for (i = 0; i < 8; i++) {                                              \
 +            assign(dst[i * dstep], e_16[i] + o_16[i]);                         \
 +            assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]);                  \
 +        }                                                                      \
      } while (0)
  
 -#define TR_32(dst, src, dstep, sstep, assign)                     \
 -    do {                                                          \
 -        int i, j;                                                 \
 -        int e_32[16];                                             \
 -        int o_32[16] = { 0 };                                     \
 -        for (i = 0; i < 16; i++)                                  \
 -            for (j = 1; j < 32; j += 2)                           \
 -                o_32[i] += transform[j][i] * src[j * sstep];      \
 -        TR_16(e_32, src, 1, 2 * sstep, SET);                      \
 -                                                                  \
 -        for (i = 0; i < 16; i++) {                                \
 -            assign(dst[i * dstep], e_32[i] + o_32[i]);            \
 -            assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]);     \
 -        }                                                         \
 +#define TR_32(dst, src, dstep, sstep, assign, end)                             \
 +    do {                                                                       \
 +        int i, j;                                                              \
 +        int e_32[16];                                                          \
 +        int o_32[16] = { 0 };                                                  \
 +        for (i = 0; i < 16; i++)                                               \
 +            for (j = 1; j < end; j += 2)                                       \
 +                o_32[i] += transform[j][i] * src[j * sstep];                   \
 +        TR_16(e_32, src, 1, 2 * sstep, SET, end/2);                            \
 +                                                                               \
 +        for (i = 0; i < 16; i++) {                                             \
 +            assign(dst[i * dstep], e_32[i] + o_32[i]);                         \
 +            assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]);                  \
 +        }                                                                      \
      } while (0)
  
 -
 -
 -static void FUNC(idct_8x8)(int16_t *coeffs)
 -{
 -    int i;
 -    int shift    = 7;
 -    int add      = 1 << (shift - 1);
 -    int16_t *src = coeffs;
 -
 -    for (i = 0; i < 8; i++) {
 -        TR_8(src, src, 8, 8, SCALE);
 -        src++;
 -    }
 -
 -    shift = 20 - BIT_DEPTH;
 -    add   = 1 << (shift - 1);
 -    for (i = 0; i < 8; i++) {
 -        TR_8(coeffs, coeffs, 1, 1, SCALE);
 -        coeffs += 8;
 -    }
 +#define IDCT_VAR4(H)                                                          \
 +    int      limit2   = FFMIN(col_limit + 4, H)
 +#define IDCT_VAR8(H)                                                          \
 +        int      limit   = FFMIN(col_limit, H);                               \
 +        int      limit2   = FFMIN(col_limit + 4, H)
 +#define IDCT_VAR16(H)   IDCT_VAR8(H)
 +#define IDCT_VAR32(H)   IDCT_VAR8(H)
 +
 +#define IDCT(H)                                                              \
 +static void FUNC(idct_##H ##x ##H )(                                         \
 +                   int16_t *coeffs, int col_limit) {                         \
 +    int i;                                                                   \
 +    int      shift   = 7;                                                    \
 +    int      add     = 1 << (shift - 1);                                     \
 +    int16_t *src     = coeffs;                                               \
 +    IDCT_VAR ##H(H);                                                         \
 +                                                                             \
 +    for (i = 0; i < H; i++) {                                                \
 +        TR_ ## H(src, src, H, H, SCALE, limit2);                             \
 +        if (limit2 < H && i%4 == 0 && !!i)                                   \
 +            limit2 -= 4;                                                     \
 +        src++;                                                               \
 +    }                                                                        \
 +                                                                             \
 +    shift   = 20 - BIT_DEPTH;                                                \
 +    add     = 1 << (shift - 1);                                              \
 +    for (i = 0; i < H; i++) {                                                \
 +        TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit);                        \
 +        coeffs += H;                                                         \
 +    }                                                                        \
  }
  
 -static void FUNC(idct_16x16)(int16_t *coeffs)
 -{
 -    int i;
 -    int shift    = 7;
 -    int add      = 1 << (shift - 1);
 -    int16_t *src = coeffs;
 +#define IDCT_DC(H)                                                           \
 +static void FUNC(idct_##H ##x ##H ##_dc)(                                    \
 +                   int16_t *coeffs) {                                        \
 +    int i, j;                                                                \
 +    int      shift   = 14 - BIT_DEPTH;                                       \
 +    int      add     = 1 << (shift - 1);                                     \
 +    int      coeff   = (((coeffs[0] + 1) >> 1) + add) >> shift;              \
 +                                                                             \
 +    for (j = 0; j < H; j++) {                                                \
 +        for (i = 0; i < H; i++) {                                            \
 +            coeffs[i+j*H] = coeff;                                           \
 +        }                                                                    \
 +    }                                                                        \
 +}
  
 -    for (i = 0; i < 16; i++) {
 -        TR_16(src, src, 16, 16, SCALE);
 -        src++;
 -    }
 +IDCT( 4)
 +IDCT( 8)
 +IDCT(16)
 +IDCT(32)
  
 -    shift = 20 - BIT_DEPTH;
 -    add   = 1 << (shift - 1);
 -    for (i = 0; i < 16; i++) {
 -        TR_16(coeffs, coeffs, 1, 1, SCALE);
 -        coeffs += 16;
 -    }
 -}
 +IDCT_DC( 4)
 +IDCT_DC( 8)
 +IDCT_DC(16)
 +IDCT_DC(32)
  
 -static void FUNC(idct_32x32)(int16_t *coeffs)
 -{
 -    int i;
 -    int shift    = 7;
 -    int add      = 1 << (shift - 1);
 -    int16_t *src = coeffs;
 +#undef TR_4
 +#undef TR_8
 +#undef TR_16
 +#undef TR_32
  
 -    for (i = 0; i < 32; i++) {
 -        TR_32(src, src, 32, 32, SCALE);
 -        src++;
 -    }
 -    src   = coeffs;
 -    shift = 20 - BIT_DEPTH;
 -    add   = 1 << (shift - 1);
 -    for (i = 0; i < 32; i++) {
 -        TR_32(coeffs, coeffs, 1, 1, SCALE);
 -        coeffs += 32;
 -    }
 -}
 +#undef SET
 +#undef SCALE
- #undef ADD_AND_SCALE
  
  static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src,
 -                                  ptrdiff_t stride, SAOParams *sao,
 -                                  int *borders, int width, int height,
 -                                  int c_idx, int class)
 +                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
 +                                  int16_t *sao_offset_val, int sao_left_class,
 +                                  int width, int height)
  {
      pixel *dst = (pixel *)_dst;
      pixel *src = (pixel *)_src;
diff --cc libavcodec/mips/hevcdsp_init_mips.c
index 3675b93,0000000..776d13e
mode 100644,000000..100644
--- a/libavcodec/mips/hevcdsp_init_mips.c
+++ b/libavcodec/mips/hevcdsp_init_mips.c
@@@ -1,454 -1,0 +1,454 @@@
 +/*
 + * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale at imgtec.com)
 + *
 + * This file is part of FFmpeg.
 + *
 + * FFmpeg is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public
 + * License as published by the Free Software Foundation; either
 + * version 2.1 of the License, or (at your option) any later version.
 + *
 + * FFmpeg is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with FFmpeg; if not, write to the Free Software
 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 + */
 +
 +#include "libavcodec/mips/hevcdsp_mips.h"
 +
 +#if HAVE_MSA
 +static av_cold void hevc_dsp_init_msa(HEVCDSPContext *c,
 +                                      const int bit_depth)
 +{
 +    if (8 == bit_depth) {
 +        c->put_hevc_qpel[1][0][0] = ff_hevc_put_hevc_pel_pixels4_8_msa;
 +        c->put_hevc_qpel[2][0][0] = ff_hevc_put_hevc_pel_pixels6_8_msa;
 +        c->put_hevc_qpel[3][0][0] = ff_hevc_put_hevc_pel_pixels8_8_msa;
 +        c->put_hevc_qpel[4][0][0] = ff_hevc_put_hevc_pel_pixels12_8_msa;
 +        c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_8_msa;
 +        c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_8_msa;
 +        c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_msa;
 +        c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_msa;
 +        c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_msa;
 +
 +        c->put_hevc_qpel[1][0][1] = ff_hevc_put_hevc_qpel_h4_8_msa;
 +        c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_msa;
 +        c->put_hevc_qpel[4][0][1] = ff_hevc_put_hevc_qpel_h12_8_msa;
 +        c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_8_msa;
 +        c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_8_msa;
 +        c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_msa;
 +        c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_msa;
 +        c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_msa;
 +
 +        c->put_hevc_qpel[1][1][0] = ff_hevc_put_hevc_qpel_v4_8_msa;
 +        c->put_hevc_qpel[3][1][0] = ff_hevc_put_hevc_qpel_v8_8_msa;
 +        c->put_hevc_qpel[4][1][0] = ff_hevc_put_hevc_qpel_v12_8_msa;
 +        c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_8_msa;
 +        c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_8_msa;
 +        c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_msa;
 +        c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_msa;
 +        c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_msa;
 +
 +        c->put_hevc_qpel[1][1][1] = ff_hevc_put_hevc_qpel_hv4_8_msa;
 +        c->put_hevc_qpel[3][1][1] = ff_hevc_put_hevc_qpel_hv8_8_msa;
 +        c->put_hevc_qpel[4][1][1] = ff_hevc_put_hevc_qpel_hv12_8_msa;
 +        c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_8_msa;
 +        c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_8_msa;
 +        c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_8_msa;
 +        c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_8_msa;
 +        c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_8_msa;
 +
 +        c->put_hevc_epel[1][0][0] = ff_hevc_put_hevc_pel_pixels4_8_msa;
 +        c->put_hevc_epel[2][0][0] = ff_hevc_put_hevc_pel_pixels6_8_msa;
 +        c->put_hevc_epel[3][0][0] = ff_hevc_put_hevc_pel_pixels8_8_msa;
 +        c->put_hevc_epel[4][0][0] = ff_hevc_put_hevc_pel_pixels12_8_msa;
 +        c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_8_msa;
 +        c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_8_msa;
 +        c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_msa;
 +
 +        c->put_hevc_epel[1][0][1] = ff_hevc_put_hevc_epel_h4_8_msa;
 +        c->put_hevc_epel[2][0][1] = ff_hevc_put_hevc_epel_h6_8_msa;
 +        c->put_hevc_epel[3][0][1] = ff_hevc_put_hevc_epel_h8_8_msa;
 +        c->put_hevc_epel[4][0][1] = ff_hevc_put_hevc_epel_h12_8_msa;
 +        c->put_hevc_epel[5][0][1] = ff_hevc_put_hevc_epel_h16_8_msa;
 +        c->put_hevc_epel[6][0][1] = ff_hevc_put_hevc_epel_h24_8_msa;
 +        c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_msa;
 +
 +        c->put_hevc_epel[1][1][0] = ff_hevc_put_hevc_epel_v4_8_msa;
 +        c->put_hevc_epel[2][1][0] = ff_hevc_put_hevc_epel_v6_8_msa;
 +        c->put_hevc_epel[3][1][0] = ff_hevc_put_hevc_epel_v8_8_msa;
 +        c->put_hevc_epel[4][1][0] = ff_hevc_put_hevc_epel_v12_8_msa;
 +        c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_8_msa;
 +        c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_8_msa;
 +        c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_msa;
 +
 +        c->put_hevc_epel[1][1][1] = ff_hevc_put_hevc_epel_hv4_8_msa;
 +        c->put_hevc_epel[2][1][1] = ff_hevc_put_hevc_epel_hv6_8_msa;
 +        c->put_hevc_epel[3][1][1] = ff_hevc_put_hevc_epel_hv8_8_msa;
 +        c->put_hevc_epel[4][1][1] = ff_hevc_put_hevc_epel_hv12_8_msa;
 +        c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_8_msa;
 +        c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_8_msa;
 +        c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_msa;
 +
 +        c->put_hevc_qpel_uni[3][0][0] = ff_hevc_put_hevc_uni_pel_pixels8_8_msa;
 +        c->put_hevc_qpel_uni[4][0][0] = ff_hevc_put_hevc_uni_pel_pixels12_8_msa;
 +        c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels16_8_msa;
 +        c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels24_8_msa;
 +        c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_msa;
 +        c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_msa;
 +        c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_msa;
 +
 +        c->put_hevc_qpel_uni[1][0][1] = ff_hevc_put_hevc_uni_qpel_h4_8_msa;
 +        c->put_hevc_qpel_uni[3][0][1] = ff_hevc_put_hevc_uni_qpel_h8_8_msa;
 +        c->put_hevc_qpel_uni[4][0][1] = ff_hevc_put_hevc_uni_qpel_h12_8_msa;
 +        c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_8_msa;
 +        c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_8_msa;
 +        c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_msa;
 +        c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_msa;
 +        c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_msa;
 +
 +        c->put_hevc_qpel_uni[1][1][0] = ff_hevc_put_hevc_uni_qpel_v4_8_msa;
 +        c->put_hevc_qpel_uni[3][1][0] = ff_hevc_put_hevc_uni_qpel_v8_8_msa;
 +        c->put_hevc_qpel_uni[4][1][0] = ff_hevc_put_hevc_uni_qpel_v12_8_msa;
 +        c->put_hevc_qpel_uni[5][1][0] = ff_hevc_put_hevc_uni_qpel_v16_8_msa;
 +        c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_8_msa;
 +        c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_msa;
 +        c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_msa;
 +        c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_msa;
 +
 +        c->put_hevc_qpel_uni[1][1][1] = ff_hevc_put_hevc_uni_qpel_hv4_8_msa;
 +        c->put_hevc_qpel_uni[3][1][1] = ff_hevc_put_hevc_uni_qpel_hv8_8_msa;
 +        c->put_hevc_qpel_uni[4][1][1] = ff_hevc_put_hevc_uni_qpel_hv12_8_msa;
 +        c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_8_msa;
 +        c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_8_msa;
 +        c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_8_msa;
 +        c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_8_msa;
 +        c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_8_msa;
 +
 +        c->put_hevc_epel_uni[3][0][0] = ff_hevc_put_hevc_uni_pel_pixels8_8_msa;
 +        c->put_hevc_epel_uni[4][0][0] = ff_hevc_put_hevc_uni_pel_pixels12_8_msa;
 +        c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels16_8_msa;
 +        c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels24_8_msa;
 +        c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_msa;
 +
 +        c->put_hevc_epel_uni[1][0][1] = ff_hevc_put_hevc_uni_epel_h4_8_msa;
 +        c->put_hevc_epel_uni[2][0][1] = ff_hevc_put_hevc_uni_epel_h6_8_msa;
 +        c->put_hevc_epel_uni[3][0][1] = ff_hevc_put_hevc_uni_epel_h8_8_msa;
 +        c->put_hevc_epel_uni[4][0][1] = ff_hevc_put_hevc_uni_epel_h12_8_msa;
 +        c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_8_msa;
 +        c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_8_msa;
 +        c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_msa;
 +
 +        c->put_hevc_epel_uni[1][1][0] = ff_hevc_put_hevc_uni_epel_v4_8_msa;
 +        c->put_hevc_epel_uni[2][1][0] = ff_hevc_put_hevc_uni_epel_v6_8_msa;
 +        c->put_hevc_epel_uni[3][1][0] = ff_hevc_put_hevc_uni_epel_v8_8_msa;
 +        c->put_hevc_epel_uni[4][1][0] = ff_hevc_put_hevc_uni_epel_v12_8_msa;
 +        c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_8_msa;
 +        c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_8_msa;
 +        c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_msa;
 +
 +        c->put_hevc_epel_uni[1][1][1] = ff_hevc_put_hevc_uni_epel_hv4_8_msa;
 +        c->put_hevc_epel_uni[2][1][1] = ff_hevc_put_hevc_uni_epel_hv6_8_msa;
 +        c->put_hevc_epel_uni[3][1][1] = ff_hevc_put_hevc_uni_epel_hv8_8_msa;
 +        c->put_hevc_epel_uni[4][1][1] = ff_hevc_put_hevc_uni_epel_hv12_8_msa;
 +        c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_8_msa;
 +        c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_8_msa;
 +        c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_msa;
 +
 +        c->put_hevc_qpel_uni_w[1][0][0] =
 +            ff_hevc_put_hevc_uni_w_pel_pixels4_8_msa;
 +        c->put_hevc_qpel_uni_w[3][0][0] =
 +            ff_hevc_put_hevc_uni_w_pel_pixels8_8_msa;
 +        c->put_hevc_qpel_uni_w[4][0][0] =
 +            ff_hevc_put_hevc_uni_w_pel_pixels12_8_msa;
 +        c->put_hevc_qpel_uni_w[5][0][0] =
 +            ff_hevc_put_hevc_uni_w_pel_pixels16_8_msa;
 +        c->put_hevc_qpel_uni_w[6][0][0] =
 +            ff_hevc_put_hevc_uni_w_pel_pixels24_8_msa;
 +        c->put_hevc_qpel_uni_w[7][0][0] =
 +            ff_hevc_put_hevc_uni_w_pel_pixels32_8_msa;
 +        c->put_hevc_qpel_uni_w[8][0][0] =
 +            ff_hevc_put_hevc_uni_w_pel_pixels48_8_msa;
 +        c->put_hevc_qpel_uni_w[9][0][0] =
 +            ff_hevc_put_hevc_uni_w_pel_pixels64_8_msa;
 +
 +        c->put_hevc_qpel_uni_w[1][0][1] = ff_hevc_put_hevc_uni_w_qpel_h4_8_msa;
 +        c->put_hevc_qpel_uni_w[3][0][1] = ff_hevc_put_hevc_uni_w_qpel_h8_8_msa;
 +        c->put_hevc_qpel_uni_w[4][0][1] = ff_hevc_put_hevc_uni_w_qpel_h12_8_msa;
 +        c->put_hevc_qpel_uni_w[5][0][1] = ff_hevc_put_hevc_uni_w_qpel_h16_8_msa;
 +        c->put_hevc_qpel_uni_w[6][0][1] = ff_hevc_put_hevc_uni_w_qpel_h24_8_msa;
 +        c->put_hevc_qpel_uni_w[7][0][1] = ff_hevc_put_hevc_uni_w_qpel_h32_8_msa;
 +        c->put_hevc_qpel_uni_w[8][0][1] = ff_hevc_put_hevc_uni_w_qpel_h48_8_msa;
 +        c->put_hevc_qpel_uni_w[9][0][1] = ff_hevc_put_hevc_uni_w_qpel_h64_8_msa;
 +
 +        c->put_hevc_qpel_uni_w[1][1][0] = ff_hevc_put_hevc_uni_w_qpel_v4_8_msa;
 +        c->put_hevc_qpel_uni_w[3][1][0] = ff_hevc_put_hevc_uni_w_qpel_v8_8_msa;
 +        c->put_hevc_qpel_uni_w[4][1][0] = ff_hevc_put_hevc_uni_w_qpel_v12_8_msa;
 +        c->put_hevc_qpel_uni_w[5][1][0] = ff_hevc_put_hevc_uni_w_qpel_v16_8_msa;
 +        c->put_hevc_qpel_uni_w[6][1][0] = ff_hevc_put_hevc_uni_w_qpel_v24_8_msa;
 +        c->put_hevc_qpel_uni_w[7][1][0] = ff_hevc_put_hevc_uni_w_qpel_v32_8_msa;
 +        c->put_hevc_qpel_uni_w[8][1][0] = ff_hevc_put_hevc_uni_w_qpel_v48_8_msa;
 +        c->put_hevc_qpel_uni_w[9][1][0] = ff_hevc_put_hevc_uni_w_qpel_v64_8_msa;
 +
 +        c->put_hevc_qpel_uni_w[1][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv4_8_msa;
 +        c->put_hevc_qpel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv8_8_msa;
 +        c->put_hevc_qpel_uni_w[4][1][1] =
 +            ff_hevc_put_hevc_uni_w_qpel_hv12_8_msa;
 +        c->put_hevc_qpel_uni_w[5][1][1] =
 +            ff_hevc_put_hevc_uni_w_qpel_hv16_8_msa;
 +        c->put_hevc_qpel_uni_w[6][1][1] =
 +            ff_hevc_put_hevc_uni_w_qpel_hv24_8_msa;
 +        c->put_hevc_qpel_uni_w[7][1][1] =
 +            ff_hevc_put_hevc_uni_w_qpel_hv32_8_msa;
 +        c->put_hevc_qpel_uni_w[8][1][1] =
 +            ff_hevc_put_hevc_uni_w_qpel_hv48_8_msa;
 +        c->put_hevc_qpel_uni_w[9][1][1] =
 +            ff_hevc_put_hevc_uni_w_qpel_hv64_8_msa;
 +
 +        c->put_hevc_epel_uni_w[1][0][0] =
 +            ff_hevc_put_hevc_uni_w_pel_pixels4_8_msa;
 +        c->put_hevc_epel_uni_w[2][0][0] =
 +            ff_hevc_put_hevc_uni_w_pel_pixels6_8_msa;
 +        c->put_hevc_epel_uni_w[3][0][0] =
 +            ff_hevc_put_hevc_uni_w_pel_pixels8_8_msa;
 +        c->put_hevc_epel_uni_w[4][0][0] =
 +            ff_hevc_put_hevc_uni_w_pel_pixels12_8_msa;
 +        c->put_hevc_epel_uni_w[5][0][0] =
 +            ff_hevc_put_hevc_uni_w_pel_pixels16_8_msa;
 +        c->put_hevc_epel_uni_w[6][0][0] =
 +            ff_hevc_put_hevc_uni_w_pel_pixels24_8_msa;
 +        c->put_hevc_epel_uni_w[7][0][0] =
 +            ff_hevc_put_hevc_uni_w_pel_pixels32_8_msa;
 +
 +        c->put_hevc_epel_uni_w[1][0][1] = ff_hevc_put_hevc_uni_w_epel_h4_8_msa;
 +        c->put_hevc_epel_uni_w[2][0][1] = ff_hevc_put_hevc_uni_w_epel_h6_8_msa;
 +        c->put_hevc_epel_uni_w[3][0][1] = ff_hevc_put_hevc_uni_w_epel_h8_8_msa;
 +        c->put_hevc_epel_uni_w[4][0][1] = ff_hevc_put_hevc_uni_w_epel_h12_8_msa;
 +        c->put_hevc_epel_uni_w[5][0][1] = ff_hevc_put_hevc_uni_w_epel_h16_8_msa;
 +        c->put_hevc_epel_uni_w[6][0][1] = ff_hevc_put_hevc_uni_w_epel_h24_8_msa;
 +        c->put_hevc_epel_uni_w[7][0][1] = ff_hevc_put_hevc_uni_w_epel_h32_8_msa;
 +
 +        c->put_hevc_epel_uni_w[1][1][0] = ff_hevc_put_hevc_uni_w_epel_v4_8_msa;
 +        c->put_hevc_epel_uni_w[2][1][0] = ff_hevc_put_hevc_uni_w_epel_v6_8_msa;
 +        c->put_hevc_epel_uni_w[3][1][0] = ff_hevc_put_hevc_uni_w_epel_v8_8_msa;
 +        c->put_hevc_epel_uni_w[4][1][0] = ff_hevc_put_hevc_uni_w_epel_v12_8_msa;
 +        c->put_hevc_epel_uni_w[5][1][0] = ff_hevc_put_hevc_uni_w_epel_v16_8_msa;
 +        c->put_hevc_epel_uni_w[6][1][0] = ff_hevc_put_hevc_uni_w_epel_v24_8_msa;
 +        c->put_hevc_epel_uni_w[7][1][0] = ff_hevc_put_hevc_uni_w_epel_v32_8_msa;
 +
 +        c->put_hevc_epel_uni_w[1][1][1] = ff_hevc_put_hevc_uni_w_epel_hv4_8_msa;
 +        c->put_hevc_epel_uni_w[2][1][1] = ff_hevc_put_hevc_uni_w_epel_hv6_8_msa;
 +        c->put_hevc_epel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_epel_hv8_8_msa;
 +        c->put_hevc_epel_uni_w[4][1][1] =
 +            ff_hevc_put_hevc_uni_w_epel_hv12_8_msa;
 +        c->put_hevc_epel_uni_w[5][1][1] =
 +            ff_hevc_put_hevc_uni_w_epel_hv16_8_msa;
 +        c->put_hevc_epel_uni_w[6][1][1] =
 +            ff_hevc_put_hevc_uni_w_epel_hv24_8_msa;
 +        c->put_hevc_epel_uni_w[7][1][1] =
 +            ff_hevc_put_hevc_uni_w_epel_hv32_8_msa;
 +
 +        c->put_hevc_qpel_bi[1][0][0] = ff_hevc_put_hevc_bi_pel_pixels4_8_msa;
 +        c->put_hevc_qpel_bi[3][0][0] = ff_hevc_put_hevc_bi_pel_pixels8_8_msa;
 +        c->put_hevc_qpel_bi[4][0][0] = ff_hevc_put_hevc_bi_pel_pixels12_8_msa;
 +        c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_8_msa;
 +        c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_8_msa;
 +        c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_msa;
 +        c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_msa;
 +        c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_msa;
 +
 +        c->put_hevc_qpel_bi[1][0][1] = ff_hevc_put_hevc_bi_qpel_h4_8_msa;
 +        c->put_hevc_qpel_bi[3][0][1] = ff_hevc_put_hevc_bi_qpel_h8_8_msa;
 +        c->put_hevc_qpel_bi[4][0][1] = ff_hevc_put_hevc_bi_qpel_h12_8_msa;
 +        c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_8_msa;
 +        c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_8_msa;
 +        c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_msa;
 +        c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_msa;
 +        c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_msa;
 +
 +        c->put_hevc_qpel_bi[1][1][0] = ff_hevc_put_hevc_bi_qpel_v4_8_msa;
 +        c->put_hevc_qpel_bi[3][1][0] = ff_hevc_put_hevc_bi_qpel_v8_8_msa;
 +        c->put_hevc_qpel_bi[4][1][0] = ff_hevc_put_hevc_bi_qpel_v12_8_msa;
 +        c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_8_msa;
 +        c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_8_msa;
 +        c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_msa;
 +        c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_msa;
 +        c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_msa;
 +
 +        c->put_hevc_qpel_bi[1][1][1] = ff_hevc_put_hevc_bi_qpel_hv4_8_msa;
 +        c->put_hevc_qpel_bi[3][1][1] = ff_hevc_put_hevc_bi_qpel_hv8_8_msa;
 +        c->put_hevc_qpel_bi[4][1][1] = ff_hevc_put_hevc_bi_qpel_hv12_8_msa;
 +        c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_8_msa;
 +        c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_8_msa;
 +        c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_8_msa;
 +        c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_8_msa;
 +        c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_8_msa;
 +
 +        c->put_hevc_epel_bi[1][0][0] = ff_hevc_put_hevc_bi_pel_pixels4_8_msa;
 +        c->put_hevc_epel_bi[2][0][0] = ff_hevc_put_hevc_bi_pel_pixels6_8_msa;
 +        c->put_hevc_epel_bi[3][0][0] = ff_hevc_put_hevc_bi_pel_pixels8_8_msa;
 +        c->put_hevc_epel_bi[4][0][0] = ff_hevc_put_hevc_bi_pel_pixels12_8_msa;
 +        c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_8_msa;
 +        c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_8_msa;
 +        c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_msa;
 +
 +        c->put_hevc_epel_bi[1][0][1] = ff_hevc_put_hevc_bi_epel_h4_8_msa;
 +        c->put_hevc_epel_bi[2][0][1] = ff_hevc_put_hevc_bi_epel_h6_8_msa;
 +        c->put_hevc_epel_bi[3][0][1] = ff_hevc_put_hevc_bi_epel_h8_8_msa;
 +        c->put_hevc_epel_bi[4][0][1] = ff_hevc_put_hevc_bi_epel_h12_8_msa;
 +        c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_8_msa;
 +        c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_8_msa;
 +        c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_msa;
 +
 +        c->put_hevc_epel_bi[1][1][0] = ff_hevc_put_hevc_bi_epel_v4_8_msa;
 +        c->put_hevc_epel_bi[2][1][0] = ff_hevc_put_hevc_bi_epel_v6_8_msa;
 +        c->put_hevc_epel_bi[3][1][0] = ff_hevc_put_hevc_bi_epel_v8_8_msa;
 +        c->put_hevc_epel_bi[4][1][0] = ff_hevc_put_hevc_bi_epel_v12_8_msa;
 +        c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_8_msa;
 +        c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_8_msa;
 +        c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_msa;
 +
 +        c->put_hevc_epel_bi[1][1][1] = ff_hevc_put_hevc_bi_epel_hv4_8_msa;
 +        c->put_hevc_epel_bi[2][1][1] = ff_hevc_put_hevc_bi_epel_hv6_8_msa;
 +        c->put_hevc_epel_bi[3][1][1] = ff_hevc_put_hevc_bi_epel_hv8_8_msa;
 +        c->put_hevc_epel_bi[4][1][1] = ff_hevc_put_hevc_bi_epel_hv12_8_msa;
 +        c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_8_msa;
 +        c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_8_msa;
 +        c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_msa;
 +
 +        c->put_hevc_qpel_bi_w[1][0][0] =
 +            ff_hevc_put_hevc_bi_w_pel_pixels4_8_msa;
 +        c->put_hevc_qpel_bi_w[3][0][0] =
 +            ff_hevc_put_hevc_bi_w_pel_pixels8_8_msa;
 +        c->put_hevc_qpel_bi_w[4][0][0] =
 +            ff_hevc_put_hevc_bi_w_pel_pixels12_8_msa;
 +        c->put_hevc_qpel_bi_w[5][0][0] =
 +            ff_hevc_put_hevc_bi_w_pel_pixels16_8_msa;
 +        c->put_hevc_qpel_bi_w[6][0][0] =
 +            ff_hevc_put_hevc_bi_w_pel_pixels24_8_msa;
 +        c->put_hevc_qpel_bi_w[7][0][0] =
 +            ff_hevc_put_hevc_bi_w_pel_pixels32_8_msa;
 +        c->put_hevc_qpel_bi_w[8][0][0] =
 +            ff_hevc_put_hevc_bi_w_pel_pixels48_8_msa;
 +        c->put_hevc_qpel_bi_w[9][0][0] =
 +            ff_hevc_put_hevc_bi_w_pel_pixels64_8_msa;
 +
 +        c->put_hevc_qpel_bi_w[1][0][1] = ff_hevc_put_hevc_bi_w_qpel_h4_8_msa;
 +        c->put_hevc_qpel_bi_w[3][0][1] = ff_hevc_put_hevc_bi_w_qpel_h8_8_msa;
 +        c->put_hevc_qpel_bi_w[4][0][1] = ff_hevc_put_hevc_bi_w_qpel_h12_8_msa;
 +        c->put_hevc_qpel_bi_w[5][0][1] = ff_hevc_put_hevc_bi_w_qpel_h16_8_msa;
 +        c->put_hevc_qpel_bi_w[6][0][1] = ff_hevc_put_hevc_bi_w_qpel_h24_8_msa;
 +        c->put_hevc_qpel_bi_w[7][0][1] = ff_hevc_put_hevc_bi_w_qpel_h32_8_msa;
 +        c->put_hevc_qpel_bi_w[8][0][1] = ff_hevc_put_hevc_bi_w_qpel_h48_8_msa;
 +        c->put_hevc_qpel_bi_w[9][0][1] = ff_hevc_put_hevc_bi_w_qpel_h64_8_msa;
 +
 +        c->put_hevc_qpel_bi_w[1][1][0] = ff_hevc_put_hevc_bi_w_qpel_v4_8_msa;
 +        c->put_hevc_qpel_bi_w[3][1][0] = ff_hevc_put_hevc_bi_w_qpel_v8_8_msa;
 +        c->put_hevc_qpel_bi_w[4][1][0] = ff_hevc_put_hevc_bi_w_qpel_v12_8_msa;
 +        c->put_hevc_qpel_bi_w[5][1][0] = ff_hevc_put_hevc_bi_w_qpel_v16_8_msa;
 +        c->put_hevc_qpel_bi_w[6][1][0] = ff_hevc_put_hevc_bi_w_qpel_v24_8_msa;
 +        c->put_hevc_qpel_bi_w[7][1][0] = ff_hevc_put_hevc_bi_w_qpel_v32_8_msa;
 +        c->put_hevc_qpel_bi_w[8][1][0] = ff_hevc_put_hevc_bi_w_qpel_v48_8_msa;
 +        c->put_hevc_qpel_bi_w[9][1][0] = ff_hevc_put_hevc_bi_w_qpel_v64_8_msa;
 +
 +        c->put_hevc_qpel_bi_w[1][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv4_8_msa;
 +        c->put_hevc_qpel_bi_w[3][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv8_8_msa;
 +        c->put_hevc_qpel_bi_w[4][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv12_8_msa;
 +        c->put_hevc_qpel_bi_w[5][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv16_8_msa;
 +        c->put_hevc_qpel_bi_w[6][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv24_8_msa;
 +        c->put_hevc_qpel_bi_w[7][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv32_8_msa;
 +        c->put_hevc_qpel_bi_w[8][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv48_8_msa;
 +        c->put_hevc_qpel_bi_w[9][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv64_8_msa;
 +
 +        c->put_hevc_epel_bi_w[1][0][0] =
 +            ff_hevc_put_hevc_bi_w_pel_pixels4_8_msa;
 +        c->put_hevc_epel_bi_w[2][0][0] =
 +            ff_hevc_put_hevc_bi_w_pel_pixels6_8_msa;
 +        c->put_hevc_epel_bi_w[3][0][0] =
 +            ff_hevc_put_hevc_bi_w_pel_pixels8_8_msa;
 +        c->put_hevc_epel_bi_w[4][0][0] =
 +            ff_hevc_put_hevc_bi_w_pel_pixels12_8_msa;
 +        c->put_hevc_epel_bi_w[5][0][0] =
 +            ff_hevc_put_hevc_bi_w_pel_pixels16_8_msa;
 +        c->put_hevc_epel_bi_w[6][0][0] =
 +            ff_hevc_put_hevc_bi_w_pel_pixels24_8_msa;
 +        c->put_hevc_epel_bi_w[7][0][0] =
 +            ff_hevc_put_hevc_bi_w_pel_pixels32_8_msa;
 +
 +        c->put_hevc_epel_bi_w[1][0][1] = ff_hevc_put_hevc_bi_w_epel_h4_8_msa;
 +        c->put_hevc_epel_bi_w[2][0][1] = ff_hevc_put_hevc_bi_w_epel_h6_8_msa;
 +        c->put_hevc_epel_bi_w[3][0][1] = ff_hevc_put_hevc_bi_w_epel_h8_8_msa;
 +        c->put_hevc_epel_bi_w[4][0][1] = ff_hevc_put_hevc_bi_w_epel_h12_8_msa;
 +        c->put_hevc_epel_bi_w[5][0][1] = ff_hevc_put_hevc_bi_w_epel_h16_8_msa;
 +        c->put_hevc_epel_bi_w[6][0][1] = ff_hevc_put_hevc_bi_w_epel_h24_8_msa;
 +        c->put_hevc_epel_bi_w[7][0][1] = ff_hevc_put_hevc_bi_w_epel_h32_8_msa;
 +
 +        c->put_hevc_epel_bi_w[1][1][0] = ff_hevc_put_hevc_bi_w_epel_v4_8_msa;
 +        c->put_hevc_epel_bi_w[2][1][0] = ff_hevc_put_hevc_bi_w_epel_v6_8_msa;
 +        c->put_hevc_epel_bi_w[3][1][0] = ff_hevc_put_hevc_bi_w_epel_v8_8_msa;
 +        c->put_hevc_epel_bi_w[4][1][0] = ff_hevc_put_hevc_bi_w_epel_v12_8_msa;
 +        c->put_hevc_epel_bi_w[5][1][0] = ff_hevc_put_hevc_bi_w_epel_v16_8_msa;
 +        c->put_hevc_epel_bi_w[6][1][0] = ff_hevc_put_hevc_bi_w_epel_v24_8_msa;
 +        c->put_hevc_epel_bi_w[7][1][0] = ff_hevc_put_hevc_bi_w_epel_v32_8_msa;
 +
 +        c->put_hevc_epel_bi_w[1][1][1] = ff_hevc_put_hevc_bi_w_epel_hv4_8_msa;
 +        c->put_hevc_epel_bi_w[2][1][1] = ff_hevc_put_hevc_bi_w_epel_hv6_8_msa;
 +        c->put_hevc_epel_bi_w[3][1][1] = ff_hevc_put_hevc_bi_w_epel_hv8_8_msa;
 +        c->put_hevc_epel_bi_w[4][1][1] = ff_hevc_put_hevc_bi_w_epel_hv12_8_msa;
 +        c->put_hevc_epel_bi_w[5][1][1] = ff_hevc_put_hevc_bi_w_epel_hv16_8_msa;
 +        c->put_hevc_epel_bi_w[6][1][1] = ff_hevc_put_hevc_bi_w_epel_hv24_8_msa;
 +        c->put_hevc_epel_bi_w[7][1][1] = ff_hevc_put_hevc_bi_w_epel_hv32_8_msa;
 +
 +        c->sao_band_filter[0] =
 +        c->sao_band_filter[1] =
 +        c->sao_band_filter[2] =
 +        c->sao_band_filter[3] =
 +        c->sao_band_filter[4] = ff_hevc_sao_band_filter_0_8_msa;
 +
 +        c->sao_edge_filter[0] =
 +        c->sao_edge_filter[1] =
 +        c->sao_edge_filter[2] =
 +        c->sao_edge_filter[3] =
 +        c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_8_msa;
 +
 +        c->hevc_h_loop_filter_luma = ff_hevc_loop_filter_luma_h_8_msa;
 +        c->hevc_v_loop_filter_luma = ff_hevc_loop_filter_luma_v_8_msa;
 +
 +        c->hevc_h_loop_filter_chroma = ff_hevc_loop_filter_chroma_h_8_msa;
 +        c->hevc_v_loop_filter_chroma = ff_hevc_loop_filter_chroma_v_8_msa;
 +
 +        c->hevc_h_loop_filter_luma_c = ff_hevc_loop_filter_luma_h_8_msa;
 +        c->hevc_v_loop_filter_luma_c = ff_hevc_loop_filter_luma_v_8_msa;
 +
 +        c->hevc_h_loop_filter_chroma_c =
 +            ff_hevc_loop_filter_chroma_h_8_msa;
 +        c->hevc_v_loop_filter_chroma_c =
 +            ff_hevc_loop_filter_chroma_v_8_msa;
 +
 +        c->idct[0] = ff_hevc_idct_4x4_msa;
 +        c->idct[1] = ff_hevc_idct_8x8_msa;
 +        c->idct[2] = ff_hevc_idct_16x16_msa;
 +        c->idct[3] = ff_hevc_idct_32x32_msa;
 +        c->idct_dc[0] = ff_hevc_idct_dc_4x4_msa;
 +        c->idct_dc[1] = ff_hevc_idct_dc_8x8_msa;
 +        c->idct_dc[2] = ff_hevc_idct_dc_16x16_msa;
 +        c->idct_dc[3] = ff_hevc_idct_dc_32x32_msa;
-         c->transform_add[0] = ff_hevc_addblk_4x4_msa;
-         c->transform_add[1] = ff_hevc_addblk_8x8_msa;
-         c->transform_add[2] = ff_hevc_addblk_16x16_msa;
-         c->transform_add[3] = ff_hevc_addblk_32x32_msa;
-         c->idct_4x4_luma = ff_hevc_idct_luma_4x4_msa;
++        c->add_residual[0] = ff_hevc_addblk_4x4_msa;
++        c->add_residual[1] = ff_hevc_addblk_8x8_msa;
++        c->add_residual[2] = ff_hevc_addblk_16x16_msa;
++        c->add_residual[3] = ff_hevc_addblk_32x32_msa;
++        c->transform_4x4_luma = ff_hevc_idct_luma_4x4_msa;
 +    }
 +}
 +#endif  // #if HAVE_MSA
 +
 +void ff_hevc_dsp_init_mips(HEVCDSPContext *c, const int bit_depth)
 +{
 +#if HAVE_MSA
 +    hevc_dsp_init_msa(c, bit_depth);
 +#endif  // #if HAVE_MSA
 +}
diff --cc libavcodec/x86/hevc_res_add.asm
index dc3e88a,0000000..869288f
mode 100644,000000..100644
--- a/libavcodec/x86/hevc_res_add.asm
+++ b/libavcodec/x86/hevc_res_add.asm
@@@ -1,388 -1,0 +1,388 @@@
 +; /*
- ; * Provide SIMD optimizations for transform_add functions for HEVC decoding
++; * Provide SIMD optimizations for add_residual functions for HEVC decoding
 +; * Copyright (c) 2014 Pierre-Edouard LEPERE
 +; *
 +; * This file is part of FFmpeg.
 +; *
 +; * FFmpeg is free software; you can redistribute it and/or
 +; * modify it under the terms of the GNU Lesser General Public
 +; * License as published by the Free Software Foundation; either
 +; * version 2.1 of the License, or (at your option) any later version.
 +; *
 +; * FFmpeg is distributed in the hope that it will be useful,
 +; * but WITHOUT ANY WARRANTY; without even the implied warranty of
 +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +; * Lesser General Public License for more details.
 +; *
 +; * You should have received a copy of the GNU Lesser General Public
 +; * License along with FFmpeg; if not, write to the Free Software
 +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 +; */
 +%include "libavutil/x86/x86util.asm"
 +
 +SECTION .text
 +
 +cextern pw_1023
 +%define max_pixels_10 pw_1023
 +
 +
 +;the tr_add macros and functions were largely inspired by x264 project's code in the h264_idct.asm file
 +%macro TR_ADD_MMX_4_8 0
 +    mova              m2, [r1]
 +    mova              m4, [r1+8]
 +    pxor              m3, m3
 +    psubw             m3, m2
 +    packuswb          m2, m2
 +    packuswb          m3, m3
 +    pxor              m5, m5
 +    psubw             m5, m4
 +    packuswb          m4, m4
 +    packuswb          m5, m5
 +
 +    movh              m0, [r0     ]
 +    movh              m1, [r0+r2  ]
 +    paddusb           m0, m2
 +    paddusb           m1, m4
 +    psubusb           m0, m3
 +    psubusb           m1, m5
 +    movh       [r0     ], m0
 +    movh       [r0+r2  ], m1
 +%endmacro
 +
 +
 +INIT_MMX mmxext
 +; void ff_hevc_add_residual4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
- cglobal hevc_transform_add4_8, 3, 4, 6
++cglobal hevc_add_residual4_8, 3, 4, 6
 +    TR_ADD_MMX_4_8
 +    add               r1, 16
 +    lea               r0, [r0+r2*2]
 +    TR_ADD_MMX_4_8
 +    RET
 +
 +%macro TR_ADD_SSE_8_8 0
 +    pxor              m3, m3
 +    mova              m4, [r1]
 +    mova              m6, [r1+16]
 +    mova              m0, [r1+32]
 +    mova              m2, [r1+48]
 +    psubw             m5, m3, m4
 +    psubw             m7, m3, m6
 +    psubw             m1, m3, m0
 +    packuswb          m4, m0
 +    packuswb          m5, m1
 +    psubw             m3, m2
 +    packuswb          m6, m2
 +    packuswb          m7, m3
 +
 +    movq                m0, [r0     ]
 +    movq                m1, [r0+r2  ]
 +    movhps              m0, [r0+r2*2]
 +    movhps              m1, [r0+r3  ]
 +    paddusb             m0, m4
 +    paddusb             m1, m6
 +    psubusb             m0, m5
 +    psubusb             m1, m7
 +    movq         [r0     ], m0
 +    movq         [r0+r2  ], m1
 +    movhps       [r0+2*r2], m0
 +    movhps       [r0+r3  ], m1
 +%endmacro
 +
 +%macro TR_ADD_SSE_16_32_8 3
 +    mova             xm2, [r1+%1   ]
 +    mova             xm6, [r1+%1+16]
 +%if cpuflag(avx2)
 +    vinserti128       m2, m2, [r1+%1+32], 1
 +    vinserti128       m6, m6, [r1+%1+48], 1
 +%endif
 +%if cpuflag(avx)
 +    psubw             m1, m0, m2
 +    psubw             m5, m0, m6
 +%else
 +    mova              m1, m0
 +    mova              m5, m0
 +    psubw             m1, m2
 +    psubw             m5, m6
 +%endif
 +    packuswb          m2, m6
 +    packuswb          m1, m5
 +
 +    mova             xm4, [r1+%1+mmsize*2   ]
 +    mova             xm6, [r1+%1+mmsize*2+16]
 +%if cpuflag(avx2)
 +    vinserti128       m4, m4, [r1+%1+96 ], 1
 +    vinserti128       m6, m6, [r1+%1+112], 1
 +%endif
 +%if cpuflag(avx)
 +    psubw             m3, m0, m4
 +    psubw             m5, m0, m6
 +%else
 +    mova              m3, m0
 +    mova              m5, m0
 +    psubw             m3, m4
 +    psubw             m5, m6
 +%endif
 +    packuswb          m4, m6
 +    packuswb          m3, m5
 +
 +    paddusb           m2, [%2]
 +    paddusb           m4, [%3]
 +    psubusb           m2, m1
 +    psubusb           m4, m3
 +    mova            [%2], m2
 +    mova            [%3], m4
 +%endmacro
 +
 +
 +%macro TRANSFORM_ADD_8 0
- ; void ff_hevc_transform_add8_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
- cglobal hevc_transform_add8_8, 3, 4, 8
++; void ff_hevc_add_residual8_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
++cglobal hevc_add_residual8_8, 3, 4, 8
 +    lea               r3, [r2*3]
 +    TR_ADD_SSE_8_8
 +    add               r1, 64
 +    lea               r0, [r0+r2*4]
 +    TR_ADD_SSE_8_8
 +    RET
 +
- ; void ff_hevc_transform_add16_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
- cglobal hevc_transform_add16_8, 3, 4, 7
++; void ff_hevc_add_residual16_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
++cglobal hevc_add_residual16_8, 3, 4, 7
 +    pxor              m0, m0
 +    lea               r3, [r2*3]
 +    TR_ADD_SSE_16_32_8  0, r0,      r0+r2
 +    TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
 +%rep 3
 +    add                r1, 128
 +    lea                r0, [r0+r2*4]
 +    TR_ADD_SSE_16_32_8  0, r0,      r0+r2
 +    TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
 +%endrep
 +    RET
 +
- ; void ff_hevc_transform_add32_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
- cglobal hevc_transform_add32_8, 3, 4, 7
++; void ff_hevc_add_residual32_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
++cglobal hevc_add_residual32_8, 3, 4, 7
 +    pxor               m0, m0
 +    TR_ADD_SSE_16_32_8  0, r0,    r0+16
 +    TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
 +%rep 15
 +    add                r1, 128
 +    lea                r0, [r0+r2*2]
 +    TR_ADD_SSE_16_32_8  0, r0,    r0+16
 +    TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
 +%endrep
 +    RET
 +%endmacro
 +
 +INIT_XMM sse2
 +TRANSFORM_ADD_8
 +INIT_XMM avx
 +TRANSFORM_ADD_8
 +
 +%if HAVE_AVX2_EXTERNAL
 +INIT_YMM avx2
- ; void ff_hevc_transform_add32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
- cglobal hevc_transform_add32_8, 3, 4, 7
++; void ff_hevc_add_residual32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
++cglobal hevc_add_residual32_8, 3, 4, 7
 +    pxor              m0, m0
 +    lea               r3, [r2*3]
 +    TR_ADD_SSE_16_32_8   0, r0,      r0+r2
 +    TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
 +%rep 7
 +    add                r1, 256
 +    lea                r0, [r0+r2*4]
 +    TR_ADD_SSE_16_32_8   0, r0,      r0+r2
 +    TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
 +%endrep
 +    RET
 +%endif
 +
 +;-----------------------------------------------------------------------------
- ; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride)
++; void ff_hevc_add_residual_10(pixel *dst, int16_t *block, int stride)
 +;-----------------------------------------------------------------------------
 +%macro TR_ADD_SSE_8_10 4
 +    mova              m0, [%4]
 +    mova              m1, [%4+16]
 +    mova              m2, [%4+32]
 +    mova              m3, [%4+48]
 +    paddw             m0, [%1+0   ]
 +    paddw             m1, [%1+%2  ]
 +    paddw             m2, [%1+%2*2]
 +    paddw             m3, [%1+%3  ]
 +    CLIPW             m0, m4, m5
 +    CLIPW             m1, m4, m5
 +    CLIPW             m2, m4, m5
 +    CLIPW             m3, m4, m5
 +    mova       [%1+0   ], m0
 +    mova       [%1+%2  ], m1
 +    mova       [%1+%2*2], m2
 +    mova       [%1+%3  ], m3
 +%endmacro
 +
 +%macro TR_ADD_MMX4_10 3
 +    mova              m0, [%1+0   ]
 +    mova              m1, [%1+%2  ]
 +    paddw             m0, [%3]
 +    paddw             m1, [%3+8]
 +    CLIPW             m0, m2, m3
 +    CLIPW             m1, m2, m3
 +    mova       [%1+0   ], m0
 +    mova       [%1+%2  ], m1
 +%endmacro
 +
 +%macro TRANS_ADD_SSE_16_10 3
 +    mova              m0, [%3]
 +    mova              m1, [%3+16]
 +    mova              m2, [%3+32]
 +    mova              m3, [%3+48]
 +    paddw             m0, [%1      ]
 +    paddw             m1, [%1+16   ]
 +    paddw             m2, [%1+%2   ]
 +    paddw             m3, [%1+%2+16]
 +    CLIPW             m0, m4, m5
 +    CLIPW             m1, m4, m5
 +    CLIPW             m2, m4, m5
 +    CLIPW             m3, m4, m5
 +    mova      [%1      ], m0
 +    mova      [%1+16   ], m1
 +    mova      [%1+%2   ], m2
 +    mova      [%1+%2+16], m3
 +%endmacro
 +
 +%macro TRANS_ADD_SSE_32_10 2
 +    mova              m0, [%2]
 +    mova              m1, [%2+16]
 +    mova              m2, [%2+32]
 +    mova              m3, [%2+48]
 +
 +    paddw             m0, [%1   ]
 +    paddw             m1, [%1+16]
 +    paddw             m2, [%1+32]
 +    paddw             m3, [%1+48]
 +    CLIPW             m0, m4, m5
 +    CLIPW             m1, m4, m5
 +    CLIPW             m2, m4, m5
 +    CLIPW             m3, m4, m5
 +    mova         [%1   ], m0
 +    mova         [%1+16], m1
 +    mova         [%1+32], m2
 +    mova         [%1+48], m3
 +%endmacro
 +
 +%macro TRANS_ADD16_AVX2 4
 +    mova              m0, [%4]
 +    mova              m1, [%4+32]
 +    mova              m2, [%4+64]
 +    mova              m3, [%4+96]
 +
 +    paddw             m0, [%1+0   ]
 +    paddw             m1, [%1+%2  ]
 +    paddw             m2, [%1+%2*2]
 +    paddw             m3, [%1+%3  ]
 +
 +    CLIPW             m0, m4, m5
 +    CLIPW             m1, m4, m5
 +    CLIPW             m2, m4, m5
 +    CLIPW             m3, m4, m5
 +    mova       [%1+0   ], m0
 +    mova       [%1+%2  ], m1
 +    mova       [%1+%2*2], m2
 +    mova       [%1+%3  ], m3
 +%endmacro
 +
 +%macro TRANS_ADD32_AVX2 3
 +    mova              m0, [%3]
 +    mova              m1, [%3+32]
 +    mova              m2, [%3+64]
 +    mova              m3, [%3+96]
 +
 +    paddw             m0, [%1      ]
 +    paddw             m1, [%1+32   ]
 +    paddw             m2, [%1+%2   ]
 +    paddw             m3, [%1+%2+32]
 +
 +    CLIPW             m0, m4, m5
 +    CLIPW             m1, m4, m5
 +    CLIPW             m2, m4, m5
 +    CLIPW             m3, m4, m5
 +    mova      [%1      ], m0
 +    mova      [%1+32   ], m1
 +    mova      [%1+%2   ], m2
 +    mova      [%1+%2+32], m3
 +%endmacro
 +
 +
 +INIT_MMX mmxext
- cglobal hevc_transform_add4_10,3,4, 6
++cglobal hevc_add_residual4_10,3,4, 6
 +    pxor              m2, m2
 +    mova              m3, [max_pixels_10]
 +    TR_ADD_MMX4_10     r0, r2, r1
 +    add               r1, 16
 +    lea               r0, [r0+2*r2]
 +    TR_ADD_MMX4_10     r0, r2, r1
 +    RET
 +
 +;-----------------------------------------------------------------------------
- ; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride)
++; void ff_hevc_add_residual_10(pixel *dst, int16_t *block, int stride)
 +;-----------------------------------------------------------------------------
 +INIT_XMM sse2
- cglobal hevc_transform_add8_10,3,4,6
++cglobal hevc_add_residual8_10,3,4,6
 +    pxor              m4, m4
 +    mova              m5, [max_pixels_10]
 +    lea               r3, [r2*3]
 +
 +    TR_ADD_SSE_8_10      r0, r2, r3, r1
 +    lea               r0, [r0+r2*4]
 +    add               r1, 64
 +    TR_ADD_SSE_8_10      r0, r2, r3, r1
 +    RET
 +
- cglobal hevc_transform_add16_10,3,4,6
++cglobal hevc_add_residual16_10,3,4,6
 +    pxor              m4, m4
 +    mova              m5, [max_pixels_10]
 +
 +    TRANS_ADD_SSE_16_10 r0, r2, r1
 +%rep 7
 +    lea                 r0, [r0+r2*2]
 +    add                 r1, 64
 +    TRANS_ADD_SSE_16_10 r0, r2, r1
 +%endrep
 +    RET
 +
- cglobal hevc_transform_add32_10,3,4,6
++cglobal hevc_add_residual32_10,3,4,6
 +    pxor              m4, m4
 +    mova              m5, [max_pixels_10]
 +
 +    TRANS_ADD_SSE_32_10 r0, r1
 +%rep 31
 +    lea                 r0, [r0+r2]
 +    add                 r1, 64
 +    TRANS_ADD_SSE_32_10 r0, r1
 +%endrep
 +    RET
 +
 +%if HAVE_AVX2_EXTERNAL
 +INIT_YMM avx2
 +
- cglobal hevc_transform_add16_10,3,4,6
++cglobal hevc_add_residual16_10,3,4,6
 +    pxor              m4, m4
 +    mova              m5, [max_pixels_10]
 +    lea               r3, [r2*3]
 +
 +    TRANS_ADD16_AVX2  r0, r2, r3, r1
 +%rep 3
 +    lea               r0, [r0+r2*4]
 +    add               r1, 128
 +    TRANS_ADD16_AVX2  r0, r2, r3, r1
 +%endrep
 +    RET
 +
- cglobal hevc_transform_add32_10,3,4,6
++cglobal hevc_add_residual32_10,3,4,6
 +    pxor              m4, m4
 +    mova              m5, [max_pixels_10]
 +
 +    TRANS_ADD32_AVX2  r0, r2, r1
 +%rep 15
 +    lea               r0, [r0+r2*2]
 +    add               r1, 128
 +    TRANS_ADD32_AVX2  r0, r2, r1
 +%endrep
 +    RET
 +%endif ;HAVE_AVX2_EXTERNAL
diff --cc libavcodec/x86/hevcdsp.h
index ad8168f,0000000..3cfdc27
mode 100644,000000..100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@@ -1,261 -1,0 +1,261 @@@
 +/*
 + * HEVC video decoder
 + *
 + * Copyright (C) 2012 - 2013 Guillaume Martres
 + * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
 + *
 + *
 + * This file is part of FFmpeg.
 + *
 + * FFmpeg is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public
 + * License as published by the Free Software Foundation; either
 + * version 2.1 of the License, or (at your option) any later version.
 + *
 + * FFmpeg is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with FFmpeg; if not, write to the Free Software
 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 + */
 +
 +#ifndef AVCODEC_X86_HEVCDSP_H
 +#define AVCODEC_X86_HEVCDSP_H
 +
 +#include <stddef.h>
 +#include <stdint.h>
 +
 +
 +#define idct_dc_proto(size, bitd, opt) \
 +                void ff_hevc_idct##size##_dc_add_##bitd##_##opt(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 +
 +#define PEL_LINK(dst, idx1, idx2, idx3, name, D, opt) \
 +dst[idx1][idx2][idx3] = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \
 +dst ## _bi[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \
 +dst ## _uni[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt; \
 +dst ## _uni_w[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt; \
 +dst ## _bi_w[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt
 +
 +
 +#define PEL_PROTOTYPE(name, D, opt) \
 +void ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); \
 +void ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width); \
 +void ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width); \
 +void ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width); \
 +void ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, int denom, int wx0, int wx1, int ox0, int ox1, intptr_t mx, intptr_t my, int width)
 +
 +
 +///////////////////////////////////////////////////////////////////////////////
 +// MC functions
 +///////////////////////////////////////////////////////////////////////////////
 +
 +#define EPEL_PROTOTYPES(fname, bitd, opt) \
 +        PEL_PROTOTYPE(fname##4,  bitd, opt); \
 +        PEL_PROTOTYPE(fname##6,  bitd, opt); \
 +        PEL_PROTOTYPE(fname##8,  bitd, opt); \
 +        PEL_PROTOTYPE(fname##12, bitd, opt); \
 +        PEL_PROTOTYPE(fname##16, bitd, opt); \
 +        PEL_PROTOTYPE(fname##24, bitd, opt); \
 +        PEL_PROTOTYPE(fname##32, bitd, opt); \
 +        PEL_PROTOTYPE(fname##48, bitd, opt); \
 +        PEL_PROTOTYPE(fname##64, bitd, opt)
 +
 +#define QPEL_PROTOTYPES(fname, bitd, opt) \
 +        PEL_PROTOTYPE(fname##4,  bitd, opt); \
 +        PEL_PROTOTYPE(fname##8,  bitd, opt); \
 +        PEL_PROTOTYPE(fname##12, bitd, opt); \
 +        PEL_PROTOTYPE(fname##16, bitd, opt); \
 +        PEL_PROTOTYPE(fname##24, bitd, opt); \
 +        PEL_PROTOTYPE(fname##32, bitd, opt); \
 +        PEL_PROTOTYPE(fname##48, bitd, opt); \
 +        PEL_PROTOTYPE(fname##64, bitd, opt)
 +
 +#define WEIGHTING_PROTOTYPE(width, bitd, opt) \
 +void ff_hevc_put_hevc_uni_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int height, int denom,  int _wx, int _ox); \
 +void ff_hevc_put_hevc_bi_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int16_t *_src2, int height, int denom,  int _wx0,  int _wx1, int _ox0, int _ox1)
 +
 +#define WEIGHTING_PROTOTYPES(bitd, opt) \
 +        WEIGHTING_PROTOTYPE(2, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(4, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(6, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(8, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(12, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(16, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(24, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(32, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(48, bitd, opt); \
 +        WEIGHTING_PROTOTYPE(64, bitd, opt)
 +
 +
 +///////////////////////////////////////////////////////////////////////////////
 +// QPEL_PIXELS EPEL_PIXELS
 +///////////////////////////////////////////////////////////////////////////////
 +EPEL_PROTOTYPES(pel_pixels ,  8, sse4);
 +EPEL_PROTOTYPES(pel_pixels , 10, sse4);
 +EPEL_PROTOTYPES(pel_pixels , 12, sse4);
 +
 +void ff_hevc_put_hevc_pel_pixels16_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_pel_pixels24_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_pel_pixels32_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_pel_pixels48_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_pel_pixels64_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +
 +void ff_hevc_put_hevc_pel_pixels16_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_pel_pixels24_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_pel_pixels32_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_pel_pixels48_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_pel_pixels64_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +
 +
 +
 +void ff_hevc_put_hevc_uni_pel_pixels32_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_uni_pel_pixels48_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_uni_pel_pixels64_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
 +void ff_hevc_put_hevc_uni_pel_pixels96_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); //used for 10bit
 +void ff_hevc_put_hevc_uni_pel_pixels128_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);//used for 10bit
 +
 +
 +void ff_hevc_put_hevc_bi_pel_pixels16_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels24_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels32_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels48_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels64_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +
 +void ff_hevc_put_hevc_bi_pel_pixels16_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels24_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels32_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels48_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +void ff_hevc_put_hevc_bi_pel_pixels64_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 +
 +///////////////////////////////////////////////////////////////////////////////
 +// EPEL
 +///////////////////////////////////////////////////////////////////////////////
 +EPEL_PROTOTYPES(epel_h ,  8, sse4);
 +EPEL_PROTOTYPES(epel_h , 10, sse4);
 +EPEL_PROTOTYPES(epel_h , 12, sse4);
 +
 +EPEL_PROTOTYPES(epel_v ,  8, sse4);
 +EPEL_PROTOTYPES(epel_v , 10, sse4);
 +EPEL_PROTOTYPES(epel_v , 12, sse4);
 +
 +EPEL_PROTOTYPES(epel_hv ,  8, sse4);
 +EPEL_PROTOTYPES(epel_hv , 10, sse4);
 +EPEL_PROTOTYPES(epel_hv , 12, sse4);
 +
 +PEL_PROTOTYPE(epel_h16, 8, avx2);
 +PEL_PROTOTYPE(epel_h24, 8, avx2);
 +PEL_PROTOTYPE(epel_h32, 8, avx2);
 +PEL_PROTOTYPE(epel_h48, 8, avx2);
 +PEL_PROTOTYPE(epel_h64, 8, avx2);
 +
 +PEL_PROTOTYPE(epel_h16,10, avx2);
 +PEL_PROTOTYPE(epel_h24,10, avx2);
 +PEL_PROTOTYPE(epel_h32,10, avx2);
 +PEL_PROTOTYPE(epel_h48,10, avx2);
 +PEL_PROTOTYPE(epel_h64,10, avx2);
 +
 +PEL_PROTOTYPE(epel_v16, 8, avx2);
 +PEL_PROTOTYPE(epel_v24, 8, avx2);
 +PEL_PROTOTYPE(epel_v32, 8, avx2);
 +PEL_PROTOTYPE(epel_v48, 8, avx2);
 +PEL_PROTOTYPE(epel_v64, 8, avx2);
 +
 +PEL_PROTOTYPE(epel_v16,10, avx2);
 +PEL_PROTOTYPE(epel_v24,10, avx2);
 +PEL_PROTOTYPE(epel_v32,10, avx2);
 +PEL_PROTOTYPE(epel_v48,10, avx2);
 +PEL_PROTOTYPE(epel_v64,10, avx2);
 +
 +PEL_PROTOTYPE(epel_hv16, 8, avx2);
 +PEL_PROTOTYPE(epel_hv24, 8, avx2);
 +PEL_PROTOTYPE(epel_hv32, 8, avx2);
 +PEL_PROTOTYPE(epel_hv48, 8, avx2);
 +PEL_PROTOTYPE(epel_hv64, 8, avx2);
 +
 +PEL_PROTOTYPE(epel_hv16,10, avx2);
 +PEL_PROTOTYPE(epel_hv24,10, avx2);
 +PEL_PROTOTYPE(epel_hv32,10, avx2);
 +PEL_PROTOTYPE(epel_hv48,10, avx2);
 +PEL_PROTOTYPE(epel_hv64,10, avx2);
 +
 +///////////////////////////////////////////////////////////////////////////////
 +// QPEL
 +///////////////////////////////////////////////////////////////////////////////
 +QPEL_PROTOTYPES(qpel_h ,  8, sse4);
 +QPEL_PROTOTYPES(qpel_h , 10, sse4);
 +QPEL_PROTOTYPES(qpel_h , 12, sse4);
 +
 +QPEL_PROTOTYPES(qpel_v,  8, sse4);
 +QPEL_PROTOTYPES(qpel_v, 10, sse4);
 +QPEL_PROTOTYPES(qpel_v, 12, sse4);
 +
 +QPEL_PROTOTYPES(qpel_hv,  8, sse4);
 +QPEL_PROTOTYPES(qpel_hv, 10, sse4);
 +QPEL_PROTOTYPES(qpel_hv, 12, sse4);
 +
 +PEL_PROTOTYPE(qpel_h16, 8, avx2);
 +PEL_PROTOTYPE(qpel_h24, 8, avx2);
 +PEL_PROTOTYPE(qpel_h32, 8, avx2);
 +PEL_PROTOTYPE(qpel_h48, 8, avx2);
 +PEL_PROTOTYPE(qpel_h64, 8, avx2);
 +
 +PEL_PROTOTYPE(qpel_h16,10, avx2);
 +PEL_PROTOTYPE(qpel_h24,10, avx2);
 +PEL_PROTOTYPE(qpel_h32,10, avx2);
 +PEL_PROTOTYPE(qpel_h48,10, avx2);
 +PEL_PROTOTYPE(qpel_h64,10, avx2);
 +
 +PEL_PROTOTYPE(qpel_v16, 8, avx2);
 +PEL_PROTOTYPE(qpel_v24, 8, avx2);
 +PEL_PROTOTYPE(qpel_v32, 8, avx2);
 +PEL_PROTOTYPE(qpel_v48, 8, avx2);
 +PEL_PROTOTYPE(qpel_v64, 8, avx2);
 +
 +PEL_PROTOTYPE(qpel_v16,10, avx2);
 +PEL_PROTOTYPE(qpel_v24,10, avx2);
 +PEL_PROTOTYPE(qpel_v32,10, avx2);
 +PEL_PROTOTYPE(qpel_v48,10, avx2);
 +PEL_PROTOTYPE(qpel_v64,10, avx2);
 +
 +PEL_PROTOTYPE(qpel_hv16, 8, avx2);
 +PEL_PROTOTYPE(qpel_hv24, 8, avx2);
 +PEL_PROTOTYPE(qpel_hv32, 8, avx2);
 +PEL_PROTOTYPE(qpel_hv48, 8, avx2);
 +PEL_PROTOTYPE(qpel_hv64, 8, avx2);
 +
 +PEL_PROTOTYPE(qpel_hv16,10, avx2);
 +PEL_PROTOTYPE(qpel_hv24,10, avx2);
 +PEL_PROTOTYPE(qpel_hv32,10, avx2);
 +PEL_PROTOTYPE(qpel_hv48,10, avx2);
 +PEL_PROTOTYPE(qpel_hv64,10, avx2);
 +
 +WEIGHTING_PROTOTYPES(8, sse4);
 +WEIGHTING_PROTOTYPES(10, sse4);
 +WEIGHTING_PROTOTYPES(12, sse4);
 +
 +///////////////////////////////////////////////////////////////////////////////
 +// ADD_RESIDUAL
 +///////////////////////////////////////////////////////////////////////////////
- void ff_hevc_transform_add4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_transform_add8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_transform_add32_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual32_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +
- void ff_hevc_transform_add8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_transform_add16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_transform_add32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +
- void ff_hevc_transform_add32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +
- void ff_hevc_transform_add4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_transform_add8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_transform_add16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_transform_add32_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual32_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +
- void ff_hevc_transform_add16_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_transform_add32_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual16_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual32_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 +
 +#endif // AVCODEC_X86_HEVCDSP_H
diff --cc libavcodec/x86/hevcdsp_init.c
index 09eb06d,fd22fc3..da73d76
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@@ -696,419 -207,102 +696,419 @@@ void ff_hevc_dsp_init_x86(HEVCDSPContex
  {
      int cpu_flags = av_get_cpu_flags();
  
 -#define SET_LUMA_FUNCS(tabname, funcname, depth, cf)      \
 -    c->tabname[0] = funcname ## _4_  ## depth ## _ ## cf; \
 -    c->tabname[1] = funcname ## _8_  ## depth ## _ ## cf; \
 -    c->tabname[2] = funcname ## _12_ ## depth ## _ ## cf; \
 -    c->tabname[3] = funcname ## _16_ ## depth ## _ ## cf; \
 -    c->tabname[4] = funcname ## _24_ ## depth ## _ ## cf; \
 -    c->tabname[5] = funcname ## _32_ ## depth ## _ ## cf; \
 -    c->tabname[6] = funcname ## _48_ ## depth ## _ ## cf; \
 -    c->tabname[7] = funcname ## _64_ ## depth ## _ ## cf;
 -
 -#define SET_CHROMA_FUNCS(tabname, funcname, depth, cf)    \
 -    c->tabname[1] = funcname ## _4_  ## depth ## _ ## cf; \
 -    c->tabname[3] = funcname ## _8_  ## depth ## _ ## cf; \
 -    c->tabname[4] = funcname ## _12_ ## depth ## _ ## cf; \
 -    c->tabname[5] = funcname ## _16_ ## depth ## _ ## cf; \
 -    c->tabname[6] = funcname ## _24_ ## depth ## _ ## cf; \
 -    c->tabname[7] = funcname ## _32_ ## depth ## _ ## cf;
 -
 -#define SET_QPEL_FUNCS(v, h, depth, cf, name) SET_LUMA_FUNCS  (put_hevc_qpel[v][h], name, depth, cf)
 -#define SET_EPEL_FUNCS(v, h, depth, cf, name) SET_CHROMA_FUNCS(put_hevc_epel[v][h], name, depth, cf)
 -
      if (bit_depth == 8) {
 +        if (EXTERNAL_MMXEXT(cpu_flags)) {
 +            c->idct_dc[0] = ff_hevc_idct4x4_dc_8_mmxext;
 +            c->idct_dc[1] = ff_hevc_idct8x8_dc_8_mmxext;
-             c->transform_add[0]    =  ff_hevc_transform_add4_8_mmxext;
++            c->add_residual[0] = ff_hevc_add_residual4_8_mmxext;
 +        }
          if (EXTERNAL_SSE2(cpu_flags)) {
              c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
              c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
 +            if (ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_sse2;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_sse2;
 +
 +            }
 +            SAO_BAND_INIT(8, sse2);
  
 -            SET_QPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
 -            SET_EPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
 +            c->idct_dc[1] = ff_hevc_idct8x8_dc_8_sse2;
 +            c->idct_dc[2] = ff_hevc_idct16x16_dc_8_sse2;
 +            c->idct_dc[3] = ff_hevc_idct32x32_dc_8_sse2;
  
-             c->transform_add[1]    = ff_hevc_transform_add8_8_sse2;
-             c->transform_add[2]    = ff_hevc_transform_add16_8_sse2;
-             c->transform_add[3]    = ff_hevc_transform_add32_8_sse2;
 -            SET_LUMA_FUNCS(put_unweighted_pred,              ff_hevc_put_unweighted_pred,     8, sse2);
 -            SET_LUMA_FUNCS(put_unweighted_pred_avg,          ff_hevc_put_unweighted_pred_avg, 8, sse2);
 -            SET_CHROMA_FUNCS(put_unweighted_pred_chroma,     ff_hevc_put_unweighted_pred,     8, sse2);
 -            SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 8, sse2);
++            c->add_residual[1] = ff_hevc_add_residual8_8_sse2;
++            c->add_residual[2] = ff_hevc_add_residual16_8_sse2;
++            c->add_residual[3] = ff_hevc_add_residual32_8_sse2;
          }
          if (EXTERNAL_SSSE3(cpu_flags)) {
 -            SET_QPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_qpel_h);
 -            SET_QPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_qpel_v);
 -            SET_EPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_epel_h);
 -            SET_EPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_epel_v);
 +            if(ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
 +            }
 +            SAO_EDGE_INIT(8, ssse3);
 +        }
 +        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
 +
 +            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels,  8, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,      8, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,      8, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,     8, sse4);
 +
 +            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     8, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     8, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    8, sse4);
 +        }
 +        if (EXTERNAL_AVX(cpu_flags)) {
 +            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_avx;
 +            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_avx;
 +            if (ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx;
 +            }
 +            SAO_BAND_INIT(8, avx);
 +
-             c->transform_add[1]    = ff_hevc_transform_add8_8_avx;
-             c->transform_add[2]    = ff_hevc_transform_add16_8_avx;
-             c->transform_add[3]    = ff_hevc_transform_add32_8_avx;
++            c->add_residual[1] = ff_hevc_add_residual8_8_avx;
++            c->add_residual[2] = ff_hevc_add_residual16_8_avx;
++            c->add_residual[3] = ff_hevc_add_residual32_8_avx;
 +        }
 +        if (EXTERNAL_AVX2(cpu_flags)) {
 +            c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_8_avx2;
 +            c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_8_avx2;
 +        }
 +        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
 +            c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2;
 +            c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2;
 +            if (ARCH_X86_64) {
 +                c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
 +                c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
 +                c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
 +
 +                c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
 +                c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
 +                c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
 +
 +                c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
 +                c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
 +                c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
 +
 +                c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
 +                c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
 +                c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
 +
 +                c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
 +                c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
 +                c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
 +
 +                c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
 +                c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
 +                c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
 +
 +                c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_avx2;
 +                c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_8_avx2;
 +                c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_8_avx2;
 +
 +                c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_avx2;
 +                c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_8_avx2;
 +                c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_8_avx2;
 +
 +                c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_avx2;
 +                c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_8_avx2;
 +                c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_8_avx2;
 +
 +                c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_avx2;
 +                c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_8_avx2;
 +                c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_8_avx2;
 +
 +                c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_avx2;
 +                c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_8_avx2;
 +                c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_8_avx2;
 +
 +                c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_avx2;
 +                c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_8_avx2;
 +                c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_8_avx2;
 +
 +                c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_avx2;
 +                c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_8_avx2;
 +                c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_8_avx2;
 +
 +                c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_avx2;
 +                c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_8_avx2;
 +                c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_8_avx2;
 +
 +                c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_avx2;
 +                c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_8_avx2;
 +                c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_8_avx2;
 +
 +                c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_avx2;
 +                c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_avx2;
 +                c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_avx2;
 +
 +                c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_avx2;
 +                c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_avx2;
 +                c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_avx2;
 +
 +                c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_avx2;
 +                c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_avx2;
 +                c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_avx2;
 +
 +                c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_avx2;
 +                c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_avx2;
 +                c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_avx2;
 +
 +                c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_avx2;
 +                c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_avx2;
 +                c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_avx2;
 +
 +                c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_avx2;
 +                c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_avx2;
 +                c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_avx2;
 +            }
 +            SAO_BAND_INIT(8, avx2);
 +
 +            c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_8_avx2;
 +            c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_8_avx2;
 +            c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_8_avx2;
 +
-             c->transform_add[3]    = ff_hevc_transform_add32_8_avx2;
++            c->add_residual[3] = ff_hevc_add_residual32_8_avx2;
          }
      } else if (bit_depth == 10) {
 +        if (EXTERNAL_MMXEXT(cpu_flags)) {
-             c->transform_add[0] = ff_hevc_transform_add4_10_mmxext;
++            c->add_residual[0] = ff_hevc_add_residual4_10_mmxext;
 +            c->idct_dc[0] = ff_hevc_idct4x4_dc_10_mmxext;
 +            c->idct_dc[1] = ff_hevc_idct8x8_dc_10_mmxext;
 +        }
          if (EXTERNAL_SSE2(cpu_flags)) {
              c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
              c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2;
 +            if (ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_sse2;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_sse2;
 +            }
 +            SAO_BAND_INIT(10, sse2);
 +            SAO_EDGE_INIT(10, sse2);
  
 -            SET_QPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
 -            SET_EPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
 +            c->idct_dc[1] = ff_hevc_idct8x8_dc_10_sse2;
 +            c->idct_dc[2] = ff_hevc_idct16x16_dc_10_sse2;
 +            c->idct_dc[3] = ff_hevc_idct32x32_dc_10_sse2;
  
-             c->transform_add[1]    = ff_hevc_transform_add8_10_sse2;
-             c->transform_add[2]    = ff_hevc_transform_add16_10_sse2;
-             c->transform_add[3]    = ff_hevc_transform_add32_10_sse2;
 -            SET_LUMA_FUNCS(put_unweighted_pred,              ff_hevc_put_unweighted_pred,     10, sse2);
 -            SET_LUMA_FUNCS(put_unweighted_pred_avg,          ff_hevc_put_unweighted_pred_avg, 10, sse2);
 -            SET_CHROMA_FUNCS(put_unweighted_pred_chroma,     ff_hevc_put_unweighted_pred,     10, sse2);
 -            SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 10, sse2);
++            c->add_residual[1] = ff_hevc_add_residual8_10_sse2;
++            c->add_residual[2] = ff_hevc_add_residual16_10_sse2;
++            c->add_residual[3] = ff_hevc_add_residual32_10_sse2;
          }
 -    }
 +        if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
 +            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
 +            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
 +        }
 +        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
 +            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 10, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,     10, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,     10, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,    10, sse4);
  
 -#if ARCH_X86_64
 -    if (bit_depth == 8) {
 -        if (EXTERNAL_SSSE3(cpu_flags)) {
 -            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
 -            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
 +            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     10, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     10, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    10, sse4);
 +        }
 +        if (EXTERNAL_AVX(cpu_flags)) {
 +            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_avx;
 +            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_avx;
 +            if (ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_avx;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_avx;
 +            }
 +            SAO_BAND_INIT(10, avx);
          }
 +        if (EXTERNAL_AVX2(cpu_flags)) {
 +            c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_10_avx2;
 +        }
 +        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
 +            c->idct_dc[2] = ff_hevc_idct16x16_dc_10_avx2;
 +            c->idct_dc[3] = ff_hevc_idct32x32_dc_10_avx2;
 +            if (ARCH_X86_64) {
 +                c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
 +                c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
 +                c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
 +                c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
 +                c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
 +
 +                c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
 +                c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
 +                c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
 +                c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
 +                c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
 +
 +                c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
 +                c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
 +                c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
 +                c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
 +                c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
 +
 +                c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
 +                c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
 +                c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
 +                c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
 +                c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
 +
 +                c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
 +                c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
 +                c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
 +                c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
 +                c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
 +                c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
 +                c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
 +                c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
 +                c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
 +                c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
 +
 +                c->put_hevc_epel[5][0][1] = ff_hevc_put_hevc_epel_h16_10_avx2;
 +                c->put_hevc_epel[6][0][1] = ff_hevc_put_hevc_epel_h24_10_avx2;
 +                c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_10_avx2;
 +                c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_10_avx2;
 +                c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_10_avx2;
 +
 +                c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_10_avx2;
 +                c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_10_avx2;
 +                c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_10_avx2;
 +                c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_10_avx2;
 +                c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_10_avx2;
 +
 +                c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_10_avx2;
 +                c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_10_avx2;
 +                c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_10_avx2;
 +                c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_10_avx2;
 +                c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_10_avx2;
 +
 +                c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_10_avx2;
 +                c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_10_avx2;
 +                c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_10_avx2;
 +                c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_10_avx2;
 +                c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_10_avx2;
 +
 +                c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_10_avx2;
 +                c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_10_avx2;
 +                c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_10_avx2;
 +                c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_10_avx2;
 +                c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_10_avx2;
 +
 +                c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_10_avx2;
 +                c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_10_avx2;
 +                c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_10_avx2;
 +                c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_10_avx2;
 +                c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_10_avx2;
 +
 +                c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_10_avx2;
 +                c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_10_avx2;
 +                c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_10_avx2;
 +                c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_10_avx2;
 +                c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_10_avx2;
 +
 +                c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_10_avx2;
 +                c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_10_avx2;
 +                c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_10_avx2;
 +                c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_10_avx2;
 +                c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_10_avx2;
 +
 +                c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_10_avx2;
 +                c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_10_avx2;
 +                c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_10_avx2;
 +                c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_10_avx2;
 +                c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_10_avx2;
 +
 +                c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_10_avx2;
 +                c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_10_avx2;
 +                c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_10_avx2;
 +                c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_10_avx2;
 +                c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_10_avx2;
  
 -        if (EXTERNAL_SSE4(cpu_flags)) {
 -            SET_LUMA_FUNCS(weighted_pred,              ff_hevc_put_weighted_pred,     8, sse4);
 -            SET_CHROMA_FUNCS(weighted_pred_chroma,     ff_hevc_put_weighted_pred,     8, sse4);
 -            SET_LUMA_FUNCS(weighted_pred_avg,          ff_hevc_put_weighted_pred_avg, 8, sse4);
 -            SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 8, sse4);
 +                c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_10_avx2;
 +                c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_10_avx2;
 +                c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_10_avx2;
 +                c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_10_avx2;
 +                c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_10_avx2;
 +
 +                c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_10_avx2;
 +                c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_10_avx2;
 +                c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_10_avx2;
 +                c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_10_avx2;
 +                c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_10_avx2;
 +
 +                c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_10_avx2;
 +                c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_10_avx2;
 +                c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_10_avx2;
 +                c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_10_avx2;
 +                c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_10_avx2;
 +
 +                c->put_hevc_qpel_uni[5][1][0] = ff_hevc_put_hevc_uni_qpel_v16_10_avx2;
 +                c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_10_avx2;
 +                c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_10_avx2;
 +                c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_10_avx2;
 +                c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_10_avx2;
 +
 +                c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_10_avx2;
 +                c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_10_avx2;
 +                c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_10_avx2;
 +                c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_10_avx2;
 +                c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_10_avx2;
 +
 +                c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_10_avx2;
 +                c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_10_avx2;
 +                c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_10_avx2;
 +                c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_10_avx2;
 +                c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_10_avx2;
 +
 +                c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_10_avx2;
 +                c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_10_avx2;
 +                c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_10_avx2;
 +                c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_10_avx2;
 +                c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_10_avx2;
 +
 +                c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_10_avx2;
 +                c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_10_avx2;
 +                c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_10_avx2;
 +                c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_10_avx2;
 +                c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_10_avx2;
 +            }
 +            SAO_BAND_INIT(10, avx2);
 +            SAO_EDGE_INIT(10, avx2);
 +
-             c->transform_add[2] = ff_hevc_transform_add16_10_avx2;
-             c->transform_add[3] = ff_hevc_transform_add32_10_avx2;
++            c->add_residual[2] = ff_hevc_add_residual16_10_avx2;
++            c->add_residual[3] = ff_hevc_add_residual32_10_avx2;
 +
 +        }
 +    } else if (bit_depth == 12) {
 +        if (EXTERNAL_MMXEXT(cpu_flags)) {
 +            c->idct_dc[0] = ff_hevc_idct4x4_dc_12_mmxext;
 +            c->idct_dc[1] = ff_hevc_idct8x8_dc_12_mmxext;
          }
 +        if (EXTERNAL_SSE2(cpu_flags)) {
 +            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2;
 +            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2;
 +            if (ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
 +            }
 +            SAO_BAND_INIT(12, sse2);
 +            SAO_EDGE_INIT(12, sse2);
  
 -        if (EXTERNAL_AVX(cpu_flags)) {
 -#if HAVE_AVX_EXTERNAL
 -            SET_QPEL_FUNCS(1, 1, 8, avx, hevc_qpel_hv);
 -            SET_EPEL_FUNCS(1, 1, 8, avx, hevc_epel_hv);
 -#endif /* HAVE_AVX_EXTERNAL */
 +            c->idct_dc[1] = ff_hevc_idct8x8_dc_12_sse2;
 +            c->idct_dc[2] = ff_hevc_idct16x16_dc_12_sse2;
 +            c->idct_dc[3] = ff_hevc_idct32x32_dc_12_sse2;
          }
 -    } else if (bit_depth == 10) {
 -        if (EXTERNAL_SSSE3(cpu_flags)) {
 -            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
 -            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
 +        if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
 +            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
 +            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_ssse3;
          }
 -        if (EXTERNAL_SSE4(cpu_flags)) {
 -            SET_LUMA_FUNCS(weighted_pred,              ff_hevc_put_weighted_pred,     10, sse4);
 -            SET_CHROMA_FUNCS(weighted_pred_chroma,     ff_hevc_put_weighted_pred,     10, sse4);
 -            SET_LUMA_FUNCS(weighted_pred_avg,          ff_hevc_put_weighted_pred_avg, 10, sse4);
 -            SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 10, sse4);
 +        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
 +            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 12, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,     12, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,     12, sse4);
 +            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,    12, sse4);
 +
 +            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 12, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     12, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     12, sse4);
 +            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    12, sse4);
          }
          if (EXTERNAL_AVX(cpu_flags)) {
 -#if HAVE_AVX_EXTERNAL
 -            SET_QPEL_FUNCS(0, 1, 10, avx, ff_hevc_qpel_h);
 -            SET_QPEL_FUNCS(1, 0, 10, avx, ff_hevc_qpel_v);
 -            SET_QPEL_FUNCS(1, 1, 10, avx, hevc_qpel_hv);
 -            SET_EPEL_FUNCS(0, 1, 10, avx, ff_hevc_epel_h);
 -            SET_EPEL_FUNCS(1, 0, 10, avx, ff_hevc_epel_v);
 -            SET_EPEL_FUNCS(1, 1, 10, avx, hevc_epel_hv);
 -#endif /* HAVE_AVX_EXTERNAL */
 +            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_avx;
 +            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_avx;
 +            if (ARCH_X86_64) {
 +                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_avx;
 +                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_avx;
 +            }
 +            SAO_BAND_INIT(12, avx);
 +        }
 +        if (EXTERNAL_AVX2(cpu_flags)) {
 +            c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_12_avx2;
 +        }
 +        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
 +            c->idct_dc[2] = ff_hevc_idct16x16_dc_12_avx2;
 +            c->idct_dc[3] = ff_hevc_idct32x32_dc_12_avx2;
 +
 +            SAO_BAND_INIT(12, avx2);
 +            SAO_EDGE_INIT(12, avx2);
          }
      }
 -#endif /* ARCH_X86_64 */
  }