[FFmpeg-cvslog] Merge commit '1bd890ad173d79e7906c5e1d06bf0a06cca4519d'
Clément Bœsch
git at videolan.org
Tue Jan 31 16:39:46 EET 2017
ffmpeg | branch: master | Clément Bœsch <cboesch at gopro.com> | Tue Jan 31 11:20:54 2017 +0100| [d0e132bab68073588dc55844a31b053fb0ee1c83] | committer: Clément Bœsch
Merge commit '1bd890ad173d79e7906c5e1d06bf0a06cca4519d'
* commit '1bd890ad173d79e7906c5e1d06bf0a06cca4519d':
hevc: Separate adding residual to prediction from IDCT
This commit should be a no-op, but isn't, because of the following renames (a sketch of the renamed DSP context members follows the list):
- transform_add → add_residual
- transform_skip → dequant
- idct_4x4_luma → transform_4x4_luma
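For reference, a minimal sketch of the renamed HEVCDSPContext members as they
stand after this merge. The types are copied from the libavcodec/hevcdsp.h hunk
below; surrounding fields are elided, and the old names and roles (as visible in
the hevc_cabac.c and hevcdsp_template.c hunks) are noted in comments:

    typedef struct HEVCDSPContext {
        /* ... */
        /* formerly transform_add[4]: adds the decoded residual to the
         * prediction, one entry per block size (4x4 .. 32x32) */
        void (*add_residual[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride);
        /* formerly transform_skip: scales (dequantizes) coefficients in the
         * transform-skip path, no inverse transform involved */
        void (*dequant)(int16_t *coeffs, int16_t log2_size);
        /* formerly idct_4x4_luma: the alternate 4x4 transform used for
         * intra luma blocks */
        void (*transform_4x4_luma)(int16_t *coeffs);
        /* ... */
    } HEVCDSPContext;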
Merged-by: Clément Bœsch <cboesch at gopro.com>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=d0e132bab68073588dc55844a31b053fb0ee1c83
---
libavcodec/arm/hevcdsp_idct_neon.S | 8 ++++----
libavcodec/arm/hevcdsp_init_neon.c | 26 ++++++++++++-------------
libavcodec/hevc.c | 4 ++--
libavcodec/hevc_cabac.c | 6 +++---
libavcodec/hevcdsp.c | 12 ++++++------
libavcodec/hevcdsp.h | 6 +++---
libavcodec/hevcdsp_template.c | 39 ++++++++++++++++---------------------
libavcodec/mips/hevcdsp_init_mips.c | 10 +++++-----
libavcodec/x86/hevc_res_add.asm | 36 +++++++++++++++++-----------------
libavcodec/x86/hevcdsp.h | 28 +++++++++++++-------------
libavcodec/x86/hevcdsp_init.c | 28 +++++++++++++-------------
11 files changed, 99 insertions(+), 104 deletions(-)
diff --git a/libavcodec/arm/hevcdsp_idct_neon.S b/libavcodec/arm/hevcdsp_idct_neon.S
index 13d540e..e39d006 100644
--- a/libavcodec/arm/hevcdsp_idct_neon.S
+++ b/libavcodec/arm/hevcdsp_idct_neon.S
@@ -97,7 +97,7 @@ function ff_hevc_idct_32x32_dc_neon_8, export=1
bx lr
endfunc
-function ff_hevc_transform_add_4x4_neon_8, export=1
+function ff_hevc_add_residual_4x4_neon_8, export=1
vldm r1, {q0-q1}
vld1.32 d4[0], [r0], r2
vld1.32 d4[1], [r0], r2
@@ -117,7 +117,7 @@ function ff_hevc_transform_add_4x4_neon_8, export=1
bx lr
endfunc
-function ff_hevc_transform_add_8x8_neon_8, export=1
+function ff_hevc_add_residual_8x8_neon_8, export=1
mov r3, #8
1: subs r3, #1
vld1.16 {q0}, [r1]!
@@ -130,7 +130,7 @@ function ff_hevc_transform_add_8x8_neon_8, export=1
bx lr
endfunc
-function ff_hevc_transform_add_16x16_neon_8, export=1
+function ff_hevc_add_residual_16x16_neon_8, export=1
mov r3, #16
1: subs r3, #1
vld1.16 {q0, q1}, [r1]!
@@ -146,7 +146,7 @@ function ff_hevc_transform_add_16x16_neon_8, export=1
bx lr
endfunc
-function ff_hevc_transform_add_32x32_neon_8, export=1
+function ff_hevc_add_residual_32x32_neon_8, export=1
mov r3, #32
1: subs r3, #1
vldm r1!, {q0-q3}
diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
index 5591807..1a3912c 100644
--- a/libavcodec/arm/hevcdsp_init_neon.c
+++ b/libavcodec/arm/hevcdsp_init_neon.c
@@ -34,14 +34,14 @@ void ff_hevc_idct_8x8_dc_neon_8(int16_t *coeffs);
void ff_hevc_idct_16x16_dc_neon_8(int16_t *coeffs);
void ff_hevc_idct_32x32_dc_neon_8(int16_t *coeffs);
void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs);
-void ff_hevc_transform_add_4x4_neon_8(uint8_t *_dst, int16_t *coeffs,
- ptrdiff_t stride);
-void ff_hevc_transform_add_8x8_neon_8(uint8_t *_dst, int16_t *coeffs,
- ptrdiff_t stride);
-void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
- ptrdiff_t stride);
-void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
- ptrdiff_t stride);
+void ff_hevc_add_residual_4x4_neon_8(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+void ff_hevc_add_residual_8x8_neon_8(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+void ff_hevc_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+void ff_hevc_add_residual_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
#define PUT_PIXELS(name) \
void name(int16_t *dst, uint8_t *src, \
@@ -156,11 +156,11 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
c->idct_dc[1] = ff_hevc_idct_8x8_dc_neon_8;
c->idct_dc[2] = ff_hevc_idct_16x16_dc_neon_8;
c->idct_dc[3] = ff_hevc_idct_32x32_dc_neon_8;
- c->transform_add[0] = ff_hevc_transform_add_4x4_neon_8;
- c->transform_add[1] = ff_hevc_transform_add_8x8_neon_8;
- c->transform_add[2] = ff_hevc_transform_add_16x16_neon_8;
- c->transform_add[3] = ff_hevc_transform_add_32x32_neon_8;
- c->idct_4x4_luma = ff_hevc_transform_luma_4x4_neon_8;
+ c->add_residual[0] = ff_hevc_add_residual_4x4_neon_8;
+ c->add_residual[1] = ff_hevc_add_residual_8x8_neon_8;
+ c->add_residual[2] = ff_hevc_add_residual_16x16_neon_8;
+ c->add_residual[3] = ff_hevc_add_residual_32x32_neon_8;
+ c->transform_4x4_luma = ff_hevc_transform_luma_4x4_neon_8;
put_hevc_qpel_neon[1][0] = ff_hevc_put_qpel_v1_neon_8;
put_hevc_qpel_neon[2][0] = ff_hevc_put_qpel_v2_neon_8;
put_hevc_qpel_neon[3][0] = ff_hevc_put_qpel_v3_neon_8;
diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
index 607a8da..505249e 100644
--- a/libavcodec/hevc.c
+++ b/libavcodec/hevc.c
@@ -1052,7 +1052,7 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
for (i = 0; i < (size * size); i++) {
coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
}
- s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
+ s->hevcdsp.add_residual[log2_trafo_size_c-2](dst, coeffs, stride);
}
}
@@ -1081,7 +1081,7 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
for (i = 0; i < (size * size); i++) {
coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
}
- s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
+ s->hevcdsp.add_residual[log2_trafo_size_c-2](dst, coeffs, stride);
}
}
} else if (s->ps.sps->chroma_format_idc && blk_idx == 3) {
diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
index 05b2821..457e087 100644
--- a/libavcodec/hevc_cabac.c
+++ b/libavcodec/hevc_cabac.c
@@ -1476,7 +1476,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]);
}
- s->hevcdsp.transform_skip(coeffs, log2_trafo_size);
+ s->hevcdsp.dequant(coeffs, log2_trafo_size);
if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
lc->cu.pred_mode == MODE_INTRA &&
@@ -1486,7 +1486,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
}
} else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
- s->hevcdsp.idct_4x4_luma(coeffs);
+ s->hevcdsp.transform_4x4_luma(coeffs);
} else {
int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
if (max_xy == 0)
@@ -1510,7 +1510,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
}
}
- s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride);
+ s->hevcdsp.add_residual[log2_trafo_size-2](dst, coeffs, stride);
}
void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size)
diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
index 9d773d9..23e923f 100644
--- a/libavcodec/hevcdsp.c
+++ b/libavcodec/hevcdsp.c
@@ -195,13 +195,13 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
#define HEVC_DSP(depth) \
hevcdsp->put_pcm = FUNC(put_pcm, depth); \
- hevcdsp->transform_add[0] = FUNC(transform_add4x4, depth); \
- hevcdsp->transform_add[1] = FUNC(transform_add8x8, depth); \
- hevcdsp->transform_add[2] = FUNC(transform_add16x16, depth); \
- hevcdsp->transform_add[3] = FUNC(transform_add32x32, depth); \
- hevcdsp->transform_skip = FUNC(transform_skip, depth); \
+ hevcdsp->add_residual[0] = FUNC(add_residual4x4, depth); \
+ hevcdsp->add_residual[1] = FUNC(add_residual8x8, depth); \
+ hevcdsp->add_residual[2] = FUNC(add_residual16x16, depth); \
+ hevcdsp->add_residual[3] = FUNC(add_residual32x32, depth); \
+ hevcdsp->dequant = FUNC(dequant, depth); \
hevcdsp->transform_rdpcm = FUNC(transform_rdpcm, depth); \
- hevcdsp->idct_4x4_luma = FUNC(transform_4x4_luma, depth); \
+ hevcdsp->transform_4x4_luma = FUNC(transform_4x4_luma, depth); \
hevcdsp->idct[0] = FUNC(idct_4x4, depth); \
hevcdsp->idct[1] = FUNC(idct_8x8, depth); \
hevcdsp->idct[2] = FUNC(idct_16x16, depth); \
diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
index 9f1f6dd..3b7e737 100644
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@ -46,13 +46,13 @@ typedef struct HEVCDSPContext {
void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
struct GetBitContext *gb, int pcm_bit_depth);
- void (*transform_add[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride);
+ void (*add_residual[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride);
- void (*transform_skip)(int16_t *coeffs, int16_t log2_size);
+ void (*dequant)(int16_t *coeffs, int16_t log2_size);
void (*transform_rdpcm)(int16_t *coeffs, int16_t log2_size, int mode);
- void (*idct_4x4_luma)(int16_t *coeffs);
+ void (*transform_4x4_luma)(int16_t *coeffs);
void (*idct[4])(int16_t *coeffs, int col_limit);
diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c
index 78eb354..66b1ac0 100644
--- a/libavcodec/hevcdsp_template.c
+++ b/libavcodec/hevcdsp_template.c
@@ -42,8 +42,8 @@ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height
}
}
-static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coeffs,
- ptrdiff_t stride, int size)
+static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *res,
+ ptrdiff_t stride, int size)
{
int x, y;
pixel *dst = (pixel *)_dst;
@@ -52,35 +52,35 @@ static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coe
for (y = 0; y < size; y++) {
for (x = 0; x < size; x++) {
- dst[x] = av_clip_pixel(dst[x] + *coeffs);
- coeffs++;
+ dst[x] = av_clip_pixel(dst[x] + *res);
+ res++;
}
dst += stride;
}
}
-static void FUNC(transform_add4x4)(uint8_t *_dst, int16_t *coeffs,
- ptrdiff_t stride)
+static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *res,
+ ptrdiff_t stride)
{
- FUNC(transquant_bypass)(_dst, coeffs, stride, 4);
+ FUNC(add_residual)(_dst, res, stride, 4);
}
-static void FUNC(transform_add8x8)(uint8_t *_dst, int16_t *coeffs,
- ptrdiff_t stride)
+static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *res,
+ ptrdiff_t stride)
{
- FUNC(transquant_bypass)(_dst, coeffs, stride, 8);
+ FUNC(add_residual)(_dst, res, stride, 8);
}
-static void FUNC(transform_add16x16)(uint8_t *_dst, int16_t *coeffs,
- ptrdiff_t stride)
+static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *res,
+ ptrdiff_t stride)
{
- FUNC(transquant_bypass)(_dst, coeffs, stride, 16);
+ FUNC(add_residual)(_dst, res, stride, 16);
}
-static void FUNC(transform_add32x32)(uint8_t *_dst, int16_t *coeffs,
- ptrdiff_t stride)
+static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res,
+ ptrdiff_t stride)
{
- FUNC(transquant_bypass)(_dst, coeffs, stride, 32);
+ FUNC(add_residual)(_dst, res, stride, 32);
}
@@ -106,13 +106,11 @@ static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode)
}
}
-static void FUNC(transform_skip)(int16_t *_coeffs, int16_t log2_size)
+static void FUNC(dequant)(int16_t *coeffs, int16_t log2_size)
{
int shift = 15 - BIT_DEPTH - log2_size;
int x, y;
int size = 1 << log2_size;
- int16_t *coeffs = _coeffs;
-
if (shift > 0) {
int offset = 1 << (shift - 1);
@@ -134,8 +132,6 @@ static void FUNC(transform_skip)(int16_t *_coeffs, int16_t log2_size)
#define SET(dst, x) (dst) = (x)
#define SCALE(dst, x) (dst) = av_clip_int16(((x) + add) >> shift)
-#define ADD_AND_SCALE(dst, x) \
- (dst) = av_clip_pixel((dst) + av_clip_int16(((x) + add) >> shift))
#define TR_4x4_LUMA(dst, src, step, assign) \
do { \
@@ -299,7 +295,6 @@ IDCT_DC(32)
#undef SET
#undef SCALE
-#undef ADD_AND_SCALE
static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src,
ptrdiff_t stride_dst, ptrdiff_t stride_src,
diff --git a/libavcodec/mips/hevcdsp_init_mips.c b/libavcodec/mips/hevcdsp_init_mips.c
index 3675b93..776d13e 100644
--- a/libavcodec/mips/hevcdsp_init_mips.c
+++ b/libavcodec/mips/hevcdsp_init_mips.c
@@ -437,11 +437,11 @@ static av_cold void hevc_dsp_init_msa(HEVCDSPContext *c,
c->idct_dc[1] = ff_hevc_idct_dc_8x8_msa;
c->idct_dc[2] = ff_hevc_idct_dc_16x16_msa;
c->idct_dc[3] = ff_hevc_idct_dc_32x32_msa;
- c->transform_add[0] = ff_hevc_addblk_4x4_msa;
- c->transform_add[1] = ff_hevc_addblk_8x8_msa;
- c->transform_add[2] = ff_hevc_addblk_16x16_msa;
- c->transform_add[3] = ff_hevc_addblk_32x32_msa;
- c->idct_4x4_luma = ff_hevc_idct_luma_4x4_msa;
+ c->add_residual[0] = ff_hevc_addblk_4x4_msa;
+ c->add_residual[1] = ff_hevc_addblk_8x8_msa;
+ c->add_residual[2] = ff_hevc_addblk_16x16_msa;
+ c->add_residual[3] = ff_hevc_addblk_32x32_msa;
+ c->transform_4x4_luma = ff_hevc_idct_luma_4x4_msa;
}
}
#endif // #if HAVE_MSA
diff --git a/libavcodec/x86/hevc_res_add.asm b/libavcodec/x86/hevc_res_add.asm
index dc3e88a..869288f 100644
--- a/libavcodec/x86/hevc_res_add.asm
+++ b/libavcodec/x86/hevc_res_add.asm
@@ -1,5 +1,5 @@
; /*
-; * Provide SIMD optimizations for transform_add functions for HEVC decoding
+; * Provide SIMD optimizations for add_residual functions for HEVC decoding
; * Copyright (c) 2014 Pierre-Edouard LEPERE
; *
; * This file is part of FFmpeg.
@@ -52,7 +52,7 @@ cextern pw_1023
INIT_MMX mmxext
; void ff_hevc_tranform_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_transform_add4_8, 3, 4, 6
+cglobal hevc_add_residual4_8, 3, 4, 6
TR_ADD_MMX_4_8
add r1, 16
lea r0, [r0+r2*2]
@@ -135,8 +135,8 @@ cglobal hevc_transform_add4_8, 3, 4, 6
%macro TRANSFORM_ADD_8 0
-; void ff_hevc_transform_add8_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_transform_add8_8, 3, 4, 8
+; void ff_hevc_add_residual8_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_add_residual8_8, 3, 4, 8
lea r3, [r2*3]
TR_ADD_SSE_8_8
add r1, 64
@@ -144,8 +144,8 @@ cglobal hevc_transform_add8_8, 3, 4, 8
TR_ADD_SSE_8_8
RET
-; void ff_hevc_transform_add16_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_transform_add16_8, 3, 4, 7
+; void ff_hevc_add_residual16_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_add_residual16_8, 3, 4, 7
pxor m0, m0
lea r3, [r2*3]
TR_ADD_SSE_16_32_8 0, r0, r0+r2
@@ -158,8 +158,8 @@ cglobal hevc_transform_add16_8, 3, 4, 7
%endrep
RET
-; void ff_hevc_transform_add32_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_transform_add32_8, 3, 4, 7
+; void ff_hevc_add_residual32_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_add_residual32_8, 3, 4, 7
pxor m0, m0
TR_ADD_SSE_16_32_8 0, r0, r0+16
TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
@@ -179,8 +179,8 @@ TRANSFORM_ADD_8
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
-; void ff_hevc_transform_add32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_transform_add32_8, 3, 4, 7
+; void ff_hevc_add_residual32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_add_residual32_8, 3, 4, 7
pxor m0, m0
lea r3, [r2*3]
TR_ADD_SSE_16_32_8 0, r0, r0+r2
@@ -195,7 +195,7 @@ cglobal hevc_transform_add32_8, 3, 4, 7
%endif
;-----------------------------------------------------------------------------
-; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride)
+; void ff_hevc_add_residual_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
%macro TR_ADD_SSE_8_10 4
mova m0, [%4]
@@ -310,7 +310,7 @@ cglobal hevc_transform_add32_8, 3, 4, 7
INIT_MMX mmxext
-cglobal hevc_transform_add4_10,3,4, 6
+cglobal hevc_add_residual4_10,3,4, 6
pxor m2, m2
mova m3, [max_pixels_10]
TR_ADD_MMX4_10 r0, r2, r1
@@ -320,10 +320,10 @@ cglobal hevc_transform_add4_10,3,4, 6
RET
;-----------------------------------------------------------------------------
-; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride)
+; void ff_hevc_add_residual_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal hevc_transform_add8_10,3,4,6
+cglobal hevc_add_residual8_10,3,4,6
pxor m4, m4
mova m5, [max_pixels_10]
lea r3, [r2*3]
@@ -334,7 +334,7 @@ cglobal hevc_transform_add8_10,3,4,6
TR_ADD_SSE_8_10 r0, r2, r3, r1
RET
-cglobal hevc_transform_add16_10,3,4,6
+cglobal hevc_add_residual16_10,3,4,6
pxor m4, m4
mova m5, [max_pixels_10]
@@ -346,7 +346,7 @@ cglobal hevc_transform_add16_10,3,4,6
%endrep
RET
-cglobal hevc_transform_add32_10,3,4,6
+cglobal hevc_add_residual32_10,3,4,6
pxor m4, m4
mova m5, [max_pixels_10]
@@ -361,7 +361,7 @@ cglobal hevc_transform_add32_10,3,4,6
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
-cglobal hevc_transform_add16_10,3,4,6
+cglobal hevc_add_residual16_10,3,4,6
pxor m4, m4
mova m5, [max_pixels_10]
lea r3, [r2*3]
@@ -374,7 +374,7 @@ cglobal hevc_transform_add16_10,3,4,6
%endrep
RET
-cglobal hevc_transform_add32_10,3,4,6
+cglobal hevc_add_residual32_10,3,4,6
pxor m4, m4
mova m5, [max_pixels_10]
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
index ad8168f..3cfdc27 100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -239,23 +239,23 @@ WEIGHTING_PROTOTYPES(12, sse4);
///////////////////////////////////////////////////////////////////////////////
// TRANSFORM_ADD
///////////////////////////////////////////////////////////////////////////////
-void ff_hevc_transform_add4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_transform_add8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_transform_add32_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual32_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_transform_add8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_transform_add16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_transform_add32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_transform_add32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_transform_add4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_transform_add8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_transform_add16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_transform_add32_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual32_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_transform_add16_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_transform_add32_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual16_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual32_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
#endif // AVCODEC_X86_HEVCDSP_H
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 09eb06d..da73d76 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -700,7 +700,7 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->idct_dc[0] = ff_hevc_idct4x4_dc_8_mmxext;
c->idct_dc[1] = ff_hevc_idct8x8_dc_8_mmxext;
- c->transform_add[0] = ff_hevc_transform_add4_8_mmxext;
+ c->add_residual[0] = ff_hevc_add_residual4_8_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
@@ -716,9 +716,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->idct_dc[2] = ff_hevc_idct16x16_dc_8_sse2;
c->idct_dc[3] = ff_hevc_idct32x32_dc_8_sse2;
- c->transform_add[1] = ff_hevc_transform_add8_8_sse2;
- c->transform_add[2] = ff_hevc_transform_add16_8_sse2;
- c->transform_add[3] = ff_hevc_transform_add32_8_sse2;
+ c->add_residual[1] = ff_hevc_add_residual8_8_sse2;
+ c->add_residual[2] = ff_hevc_add_residual16_8_sse2;
+ c->add_residual[3] = ff_hevc_add_residual32_8_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
if(ARCH_X86_64) {
@@ -748,9 +748,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
}
SAO_BAND_INIT(8, avx);
- c->transform_add[1] = ff_hevc_transform_add8_8_avx;
- c->transform_add[2] = ff_hevc_transform_add16_8_avx;
- c->transform_add[3] = ff_hevc_transform_add32_8_avx;
+ c->add_residual[1] = ff_hevc_add_residual8_8_avx;
+ c->add_residual[2] = ff_hevc_add_residual16_8_avx;
+ c->add_residual[3] = ff_hevc_add_residual32_8_avx;
}
if (EXTERNAL_AVX2(cpu_flags)) {
c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_8_avx2;
@@ -850,11 +850,11 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_8_avx2;
c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_8_avx2;
- c->transform_add[3] = ff_hevc_transform_add32_8_avx2;
+ c->add_residual[3] = ff_hevc_add_residual32_8_avx2;
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
- c->transform_add[0] = ff_hevc_transform_add4_10_mmxext;
+ c->add_residual[0] = ff_hevc_add_residual4_10_mmxext;
c->idct_dc[0] = ff_hevc_idct4x4_dc_10_mmxext;
c->idct_dc[1] = ff_hevc_idct8x8_dc_10_mmxext;
}
@@ -872,9 +872,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->idct_dc[2] = ff_hevc_idct16x16_dc_10_sse2;
c->idct_dc[3] = ff_hevc_idct32x32_dc_10_sse2;
- c->transform_add[1] = ff_hevc_transform_add8_10_sse2;
- c->transform_add[2] = ff_hevc_transform_add16_10_sse2;
- c->transform_add[3] = ff_hevc_transform_add32_10_sse2;
+ c->add_residual[1] = ff_hevc_add_residual8_10_sse2;
+ c->add_residual[2] = ff_hevc_add_residual16_10_sse2;
+ c->add_residual[3] = ff_hevc_add_residual32_10_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
@@ -1053,8 +1053,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
SAO_BAND_INIT(10, avx2);
SAO_EDGE_INIT(10, avx2);
- c->transform_add[2] = ff_hevc_transform_add16_10_avx2;
- c->transform_add[3] = ff_hevc_transform_add32_10_avx2;
+ c->add_residual[2] = ff_hevc_add_residual16_10_avx2;
+ c->add_residual[3] = ff_hevc_add_residual32_10_avx2;
}
} else if (bit_depth == 12) {
======================================================================
diff --cc libavcodec/arm/hevcdsp_idct_neon.S
index 13d540e,0000000..e39d006
mode 100644,000000..100644
--- a/libavcodec/arm/hevcdsp_idct_neon.S
+++ b/libavcodec/arm/hevcdsp_idct_neon.S
@@@ -1,465 -1,0 +1,465 @@@
+/*
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi at vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+function ff_hevc_idct_4x4_dc_neon_8, export=1
+ ldrsh r1, [r0]
+ ldr r2, =0x20
+ add r1, #1
+ asr r1, #1
+ add r1, r2
+ asr r1, #6
+ vdup.16 q0, r1
+ vdup.16 q1, r1
+ vst1.16 {q0, q1}, [r0]
+ bx lr
+endfunc
+
+function ff_hevc_idct_8x8_dc_neon_8, export=1
+ ldrsh r1, [r0]
+ ldr r2, =0x20
+ add r1, #1
+ asr r1, #1
+ add r1, r2
+ asr r1, #6
+ vdup.16 q8, r1
+ vdup.16 q9, r1
+ vmov.16 q10, q8
+ vmov.16 q11, q8
+ vmov.16 q12, q8
+ vmov.16 q13, q8
+ vmov.16 q14, q8
+ vmov.16 q15, q8
+ vstm r0, {q8-q15}
+ bx lr
+endfunc
+
+function ff_hevc_idct_16x16_dc_neon_8, export=1
+ ldrsh r1, [r0]
+ ldr r2, =0x20
+ add r1, #1
+ asr r1, #1
+ add r1, r2
+ asr r1, #6
+ vdup.16 q8, r1
+ vdup.16 q9, r1
+ vmov.16 q10, q8
+ vmov.16 q11, q8
+ vmov.16 q12, q8
+ vmov.16 q13, q8
+ vmov.16 q14, q8
+ vmov.16 q15, q8
+ vstm r0!, {q8-q15}
+ vstm r0!, {q8-q15}
+ vstm r0!, {q8-q15}
+ vstm r0, {q8-q15}
+ bx lr
+endfunc
+
+function ff_hevc_idct_32x32_dc_neon_8, export=1
+ ldrsh r1, [r0]
+ ldr r2, =0x20
+ add r1, #1
+ asr r1, #1
+ add r1, r2
+ asr r1, #6
+ mov r3, #16
+ vdup.16 q8, r1
+ vdup.16 q9, r1
+ vmov.16 q10, q8
+ vmov.16 q11, q8
+ vmov.16 q12, q8
+ vmov.16 q13, q8
+ vmov.16 q14, q8
+ vmov.16 q15, q8
+1: subs r3, #1
+ vstm r0!, {q8-q15}
+ bne 1b
+ bx lr
+endfunc
+
- function ff_hevc_transform_add_4x4_neon_8, export=1
++function ff_hevc_add_residual_4x4_neon_8, export=1
+ vldm r1, {q0-q1}
+ vld1.32 d4[0], [r0], r2
+ vld1.32 d4[1], [r0], r2
+ vld1.32 d5[0], [r0], r2
+ vld1.32 d5[1], [r0], r2
+ sub r0, r0, r2, lsl #2
+ vmovl.u8 q8, d4
+ vmovl.u8 q9, d5
+ vqadd.s16 q0, q0, q8
+ vqadd.s16 q1, q1, q9
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vst1.32 d0[0], [r0], r2
+ vst1.32 d0[1], [r0], r2
+ vst1.32 d1[0], [r0], r2
+ vst1.32 d1[1], [r0], r2
+ bx lr
+endfunc
+
- function ff_hevc_transform_add_8x8_neon_8, export=1
++function ff_hevc_add_residual_8x8_neon_8, export=1
+ mov r3, #8
+1: subs r3, #1
+ vld1.16 {q0}, [r1]!
+ vld1.8 d16, [r0]
+ vmovl.u8 q8, d16
+ vqadd.s16 q0, q8
+ vqmovun.s16 d0, q0
+ vst1.32 d0, [r0], r2
+ bne 1b
+ bx lr
+endfunc
+
- function ff_hevc_transform_add_16x16_neon_8, export=1
++function ff_hevc_add_residual_16x16_neon_8, export=1
+ mov r3, #16
+1: subs r3, #1
+ vld1.16 {q0, q1}, [r1]!
+ vld1.8 {q8}, [r0]
+ vmovl.u8 q9, d16
+ vmovl.u8 q10, d17
+ vqadd.s16 q0, q9
+ vqadd.s16 q1, q10
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vst1.8 {q0}, [r0], r2
+ bne 1b
+ bx lr
+endfunc
+
- function ff_hevc_transform_add_32x32_neon_8, export=1
++function ff_hevc_add_residual_32x32_neon_8, export=1
+ mov r3, #32
+1: subs r3, #1
+ vldm r1!, {q0-q3}
+ vld1.8 {q8, q9}, [r0]
+ vmovl.u8 q10, d16
+ vmovl.u8 q11, d17
+ vmovl.u8 q12, d18
+ vmovl.u8 q13, d19
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q11
+ vqadd.s16 q2, q12
+ vqadd.s16 q3, q13
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vqmovun.s16 d2, q2
+ vqmovun.s16 d3, q3
+ vst1.8 {q0, q1}, [r0], r2
+ bne 1b
+ bx lr
+endfunc
+
+.macro transpose_16b_8x8 r0, r1, r2, r3, r4, r5, r6, r7
+ vtrn.64 \r0, \r4
+ vtrn.64 \r1, \r5
+ vtrn.64 \r2, \r6
+ vtrn.64 \r3, \r7
+ vtrn.32 \r0, \r2
+ vtrn.32 \r1, \r3
+ vtrn.32 \r4, \r6
+ vtrn.32 \r5, \r7
+ vtrn.16 \r0, \r1
+ vtrn.16 \r2, \r3
+ vtrn.16 \r4, \r5
+ vtrn.16 \r6, \r7
+.endm
+
+// in 4 q regs
+// output 8 d regs
+.macro transpose_16b_4x4 r0, r1, r2, r3
+ vtrn.32 \r0, \r2
+ vtrn.32 \r1, \r3
+ vtrn.16 \r0, \r1
+ vtrn.16 \r2, \r3
+.endm
+
+/* uses registers q2 - q9 for temp values */
+/* TODO: reorder */
+.macro tr4_luma_shift r0, r1, r2, r3, shift
+ vaddl.s16 q5, \r0, \r2 // c0 = src0 + src2
+ vaddl.s16 q2, \r2, \r3 // c1 = src2 + src3
+ vsubl.s16 q4, \r0, \r3 // c2 = src0 - src3
+ vmull.s16 q6, \r1, d0[0] // c3 = 74 * src1
+
+ vaddl.s16 q7, \r0, \r3 // src0 + src3
+ vsubw.s16 q7, q7, \r2 // src0 - src2 + src3
+ vmul.s32 q7, q7, d0[0] // dst2 = 74 * (src0 - src2 + src3)
+
+ vmul.s32 q8, q5, d0[1] // 29 * c0
+ vmul.s32 q9, q2, d1[0] // 55 * c1
+ vadd.s32 q8, q9 // 29 * c0 + 55 * c1
+ vadd.s32 q8, q6 // dst0 = 29 * c0 + 55 * c1 + c3
+
+ vmul.s32 q2, q2, d0[1] // 29 * c1
+ vmul.s32 q9, q4, d1[0] // 55 * c2
+ vsub.s32 q9, q2 // 55 * c2 - 29 * c1
+ vadd.s32 q9, q6 // dst1 = 55 * c2 - 29 * c1 + c3
+
+ vmul.s32 q5, q5, d1[0] // 55 * c0
+ vmul.s32 q4, q4, d0[1] // 29 * c2
+ vadd.s32 q5, q4 // 55 * c0 + 29 * c2
+ vsub.s32 q5, q6 // dst3 = 55 * c0 + 29 * c2 - c3
+
+ vqrshrn.s32 \r0, q8, \shift
+ vqrshrn.s32 \r1, q9, \shift
+ vqrshrn.s32 \r2, q7, \shift
+ vqrshrn.s32 \r3, q5, \shift
+.endm
+
+/* uses registers q2 - q6 for temp values */
+.macro tr4 r0, r1, r2, r3
+ vmull.s16 q4, \r1, d0[0] // 83 * src1
+ vmull.s16 q6, \r1, d0[1] // 36 * src1
+ vshll.s16 q2, \r0, #6 // 64 * src0
+ vshll.s16 q3, \r2, #6 // 64 * src2
+ vadd.s32 q5, q2, q3 // 64 * (src0 + src2) e0
+ vsub.s32 q2, q2, q3 // 64 * (src0 - src2) e1
+ vmlal.s16 q4, \r3, d0[1] // 83 * src1 + 36 * src3 o0
+ vmlsl.s16 q6, \r3, d0[0] // 36 * src1 - 83 * src3 o1
+
+ vsub.s32 q3, q5, q4 // e0 - o0
+ vadd.s32 q4, q5, q4 // e0 + o0
+ vadd.s32 q5, q2, q6 // e1 + o1
+ vsub.s32 q6, q2, q6 // e1 - o1
+.endm
+
+.macro tr4_shift r0, r1, r2, r3, shift
+ vmull.s16 q4, \r1, d0[0] // 83 * src1
+ vmull.s16 q6, \r1, d0[1] // 36 * src1
+ vshll.s16 q2, \r0, #6 // 64 * src0
+ vshll.s16 q3, \r2, #6 // 64 * src2
+ vadd.s32 q5, q2, q3 // 64 * (src0 + src2) e0
+ vsub.s32 q2, q2, q3 // 64 * (src0 - src2) e1
+ vmlal.s16 q4, \r3, d0[1] // 83 * src1 + 36 * src3 o0
+ vmlsl.s16 q6, \r3, d0[0] // 36 * src1 - 83 * src3 o1
+
+ vsub.s32 q3, q5, q4 // e0 - o0
+ vadd.s32 q4, q5, q4 // e0 + o0
+ vadd.s32 q5, q2, q6 // e1 + o1
+ vsub.s32 q6, q2, q6 // e1 - o1
+
+ vqrshrn.s32 \r0, q4, \shift
+ vqrshrn.s32 \r1, q5, \shift
+ vqrshrn.s32 \r2, q6, \shift
+ vqrshrn.s32 \r3, q3, \shift
+.endm
+
+function ff_hevc_transform_4x4_neon_8, export=1
+ vpush {d8-d15}
+ vld1.16 {q14, q15}, [r0] // coeffs
+ ldr r3, =0x00240053 // 36 and 83
+ vmov.32 d0[0], r3
+
+ tr4_shift d28, d29, d30, d31, #7
+
+ vtrn.16 d28, d29
+ vtrn.16 d30, d31
+ vtrn.32 q14, q15
+
+ tr4_shift d28, d29, d30, d31, #12
+
+ vtrn.16 d28, d29
+ vtrn.16 d30, d31
+ vtrn.32 q14, q15
+
+ vst1.16 {q14, q15}, [r0]
+ vpop {d8-d15}
+ bx lr
+endfunc
+
+function ff_hevc_transform_luma_4x4_neon_8, export=1
+ vpush {d8-d15}
+ vld1.16 {q14, q15}, [r0] // coeffs
+ ldr r3, =0x4a // 74
+ vmov.32 d0[0], r3
+ ldr r3, =0x1d // 29
+ vmov.32 d0[1], r3
+ ldr r3, =0x37 // 55
+ vmov.32 d1[0], r3
+
+ tr4_luma_shift d28, d29, d30, d31, #7
+
+ vtrn.16 d28, d29
+ vtrn.16 d30, d31
+ vtrn.32 q14, q15
+
+ tr4_luma_shift d28, d29, d30, d31, #12
+
+ vtrn.16 d28, d29
+ vtrn.16 d30, d31
+ vtrn.32 q14, q15
+ vst1.16 {q14, q15}, [r0]
+ vpop {d8-d15}
+ bx lr
+endfunc
+
+.macro tr8_begin in0, in1, in2, in3
+ vmull.s16 q7, \in0, d1[1] // 89 * src1
+ vmull.s16 q8, \in0, d1[0] // 75 * src1
+ vmull.s16 q9, \in0, d1[3] // 50 * src1
+ vmull.s16 q10, \in0, d1[2] // 18 * src1
+
+ vmlal.s16 q7, \in1, d1[0] // 75 * src3
+ vmlsl.s16 q8, \in1, d1[2] //-18 * src3
+ vmlsl.s16 q9, \in1, d1[1] //-89 * src3
+ vmlsl.s16 q10, \in1, d1[3] //-50 * src3
+
+ vmlal.s16 q7, \in2, d1[3] // 50 * src5
+ vmlsl.s16 q8, \in2, d1[1] //-89 * src5
+ vmlal.s16 q9, \in2, d1[2] // 18 * src5
+ vmlal.s16 q10, \in2, d1[0] // 75 * src5
+
+ vmlal.s16 q7, \in3, d1[2] // 18 * src7
+ vmlsl.s16 q8, \in3, d1[3] //-50 * src7
+ vmlal.s16 q9, \in3, d1[0] // 75 * src7
+ vmlsl.s16 q10, \in3, d1[1] //-89 * src7
+.endm
+
+.macro tr8_end shift
+ vadd.s32 q1, q4, q7 // e_8[0] + o_8[0], dst[0]
+ vsub.s32 q4, q4, q7 // e_8[0] - o_8[0], dst[7]
+
+ vadd.s32 q2, q5, q8 // e_8[1] + o_8[1], dst[1]
+ vsub.s32 q5, q5, q8 // e_8[1] - o_8[1], dst[6]
+
+ vadd.s32 q11, q6, q9 // e_8[2] + o_8[2], dst[2]
+ vsub.s32 q6, q6, q9 // e_8[2] - o_8[2], dst[5]
+
+ vadd.s32 q12, q3, q10 // e_8[3] + o_8[3], dst[3]
+ vsub.s32 q3, q3, q10 // e_8[3] - o_8[3], dst[4]
+ vqrshrn.s32 d2, q1, \shift
+ vqrshrn.s32 d3, q2, \shift
+ vqrshrn.s32 d4, q11, \shift
+ vqrshrn.s32 d5, q12, \shift
+ vqrshrn.s32 d6, q3, \shift
+ vqrshrn.s32 d7, q6, \shift
+ vqrshrn.s32 d9, q4, \shift
+ vqrshrn.s32 d8, q5, \shift
+.endm
+
+function ff_hevc_transform_8x8_neon_8, export=1
+ push {r4-r8}
+ vpush {d8-d15}
+ mov r5, #16
+
+ adr r3, tr4f
+ vld1.16 {d0, d1}, [r3]
+
+ // left half
+ vld1.16 {d24}, [r0], r5
+ vld1.16 {d25}, [r0], r5
+ vld1.16 {d26}, [r0], r5
+ vld1.16 {d27}, [r0], r5
+ vld1.16 {d28}, [r0], r5
+ vld1.16 {d29}, [r0], r5
+ vld1.16 {d30}, [r0], r5
+ vld1.16 {d31}, [r0], r5
+ sub r0, #128
+ tr8_begin d25, d27, d29, d31
+ tr4 d24, d26, d28, d30
+ tr8_end #7
+ vst1.16 {d2}, [r0], r5
+ vst1.16 {d3}, [r0], r5
+ vst1.16 {d4}, [r0], r5
+ vst1.16 {d5}, [r0], r5
+ vst1.16 {d6}, [r0], r5
+ vst1.16 {d7}, [r0], r5
+ vst1.16 {d8}, [r0], r5
+ vst1.16 {d9}, [r0], r5
+ sub r0, #128
+ //skip right half if col_limit in r1 is less than 4
+ cmp r1, #4
+ blt 1f
+ //right half
+ add r0, #8
+ vld1.16 {d24}, [r0], r5
+ vld1.16 {d25}, [r0], r5
+ vld1.16 {d26}, [r0], r5
+ vld1.16 {d27}, [r0], r5
+ vld1.16 {d28}, [r0], r5
+ vld1.16 {d29}, [r0], r5
+ vld1.16 {d30}, [r0], r5
+ vld1.16 {d31}, [r0], r5
+ sub r0, #128
+ tr8_begin d25, d27, d29, d31
+ tr4 d24, d26, d28, d30
+ tr8_end #7
+ vst1.16 {d2}, [r0], r5
+ vst1.16 {d3}, [r0], r5
+ vst1.16 {d4}, [r0], r5
+ vst1.16 {d5}, [r0], r5
+ vst1.16 {d6}, [r0], r5
+ vst1.16 {d7}, [r0], r5
+ vst1.16 {d8}, [r0], r5
+ vst1.16 {d9}, [r0], r5
+ sub r0, #136
+1:
+ // top half
+ vldm r0, {q12-q15} // coeffs
+ transpose_16b_4x4 d24, d26, d28, d30
+ transpose_16b_4x4 d25, d27, d29, d31
+ tr8_begin d26, d30, d27, d31
+ tr4 d24, d28, d25, d29
+ tr8_end #12
+ transpose_16b_4x4 d2, d3, d4, d5
+ transpose_16b_4x4 d6, d7, d8, d9
+ vswp d7, d5
+ vswp d7, d8
+ vswp d3, d6
+ vswp d6, d4
+ vstm r0!, {q1-q4}
+
+ // bottom half
+ vldm r0, {q12-q15} // coeffs
+ transpose_16b_4x4 d24, d26, d28, d30
+ transpose_16b_4x4 d25, d27, d29, d31
+ tr8_begin d26, d30, d27, d31
+ tr4 d24, d28, d25, d29
+ tr8_end #12
+ transpose_16b_4x4 d2, d3, d4, d5
+ transpose_16b_4x4 d6, d7, d8, d9
+ vswp d7, d5
+ vswp d7, d8
+ vswp d3, d6
+ vswp d6, d4
+ //vstm r0, {q1-q4}
+ vst1.16 {q1-q2}, [r0]
+ add r0, #32
+ vst1.16 {q3-q4}, [r0]
+ sub r0, #32
+ vpop {d8-d15}
+ pop {r4-r8}
+ bx lr
+endfunc
+
+.align 4
+tr4f:
+.word 0x00240053 // 36 and d1[0] = 83
+.word 0x00000000
+tr8f:
+.word 0x0059004b // 89, d0[0] = 75
+.word 0x00320012 // 50, d0[2] = 18
+tr16:
+.word 0x005a0057 // 90, d2[0] = 87
+.word 0x00500046 // 80, d2[2] = 70
+.word 0x0039002b // 57, d2[0] = 43
+.word 0x00190009 // 25, d2[2] = 9
diff --cc libavcodec/arm/hevcdsp_init_neon.c
index 5591807,0000000..1a3912c
mode 100644,000000..100644
--- a/libavcodec/arm/hevcdsp_init_neon.c
+++ b/libavcodec/arm/hevcdsp_init_neon.c
@@@ -1,224 -1,0 +1,224 @@@
+/*
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi at vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/hevcdsp.h"
+#include "hevcdsp_arm.h"
+
+void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_transform_4x4_neon_8(int16_t *coeffs, int col_limit);
+void ff_hevc_transform_8x8_neon_8(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_4x4_dc_neon_8(int16_t *coeffs);
+void ff_hevc_idct_8x8_dc_neon_8(int16_t *coeffs);
+void ff_hevc_idct_16x16_dc_neon_8(int16_t *coeffs);
+void ff_hevc_idct_32x32_dc_neon_8(int16_t *coeffs);
+void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs);
- void ff_hevc_transform_add_4x4_neon_8(uint8_t *_dst, int16_t *coeffs,
- ptrdiff_t stride);
- void ff_hevc_transform_add_8x8_neon_8(uint8_t *_dst, int16_t *coeffs,
- ptrdiff_t stride);
- void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
- ptrdiff_t stride);
- void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
- ptrdiff_t stride);
++void ff_hevc_add_residual_4x4_neon_8(uint8_t *_dst, int16_t *coeffs,
++ ptrdiff_t stride);
++void ff_hevc_add_residual_8x8_neon_8(uint8_t *_dst, int16_t *coeffs,
++ ptrdiff_t stride);
++void ff_hevc_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
++ ptrdiff_t stride);
++void ff_hevc_add_residual_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
++ ptrdiff_t stride);
+
+#define PUT_PIXELS(name) \
+ void name(int16_t *dst, uint8_t *src, \
+ ptrdiff_t srcstride, int height, \
+ intptr_t mx, intptr_t my, int width)
+PUT_PIXELS(ff_hevc_put_pixels_w2_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w4_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w6_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w8_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w12_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w16_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w24_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8);
+#undef PUT_PIXELS
+
+static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int height, int width);
+static void (*put_hevc_qpel_uw_neon[4][4])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int width, int height, int16_t* src2, ptrdiff_t src2stride);
+void ff_hevc_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width);
+#define QPEL_FUNC(name) \
+ void name(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, \
+ int height, int width)
+
+QPEL_FUNC(ff_hevc_put_qpel_v1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_v3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1v1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1v3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2v1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2v3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3v1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3v3_neon_8);
+#undef QPEL_FUNC
+
+#define QPEL_FUNC_UW_PIX(name) \
+ void name(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, \
+ int height, intptr_t mx, intptr_t my, int width);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w4_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w8_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w16_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w24_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w32_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w48_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w64_neon_8);
+#undef QPEL_FUNC_UW_PIX
+
+#define QPEL_FUNC_UW(name) \
+ void name(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, \
+ int width, int height, int16_t* src2, ptrdiff_t src2stride);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_pixels_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v3_neon_8);
+#undef QPEL_FUNC_UW
+
+void ff_hevc_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width) {
+
+ put_hevc_qpel_neon[my][mx](dst, MAX_PB_SIZE, src, srcstride, height, width);
+}
+
+void ff_hevc_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width) {
+
+ put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, NULL, 0);
+}
+
+void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width) {
+ put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE);
+}
+
+av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
+{
+ if (bit_depth == 8) {
+ int x;
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_neon;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon;
+ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon;
+ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon;
+ c->idct[0] = ff_hevc_transform_4x4_neon_8;
+ c->idct[1] = ff_hevc_transform_8x8_neon_8;
+ c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_8;
+ c->idct_dc[1] = ff_hevc_idct_8x8_dc_neon_8;
+ c->idct_dc[2] = ff_hevc_idct_16x16_dc_neon_8;
+ c->idct_dc[3] = ff_hevc_idct_32x32_dc_neon_8;
- c->transform_add[0] = ff_hevc_transform_add_4x4_neon_8;
- c->transform_add[1] = ff_hevc_transform_add_8x8_neon_8;
- c->transform_add[2] = ff_hevc_transform_add_16x16_neon_8;
- c->transform_add[3] = ff_hevc_transform_add_32x32_neon_8;
- c->idct_4x4_luma = ff_hevc_transform_luma_4x4_neon_8;
++ c->add_residual[0] = ff_hevc_add_residual_4x4_neon_8;
++ c->add_residual[1] = ff_hevc_add_residual_8x8_neon_8;
++ c->add_residual[2] = ff_hevc_add_residual_16x16_neon_8;
++ c->add_residual[3] = ff_hevc_add_residual_32x32_neon_8;
++ c->transform_4x4_luma = ff_hevc_transform_luma_4x4_neon_8;
+ put_hevc_qpel_neon[1][0] = ff_hevc_put_qpel_v1_neon_8;
+ put_hevc_qpel_neon[2][0] = ff_hevc_put_qpel_v2_neon_8;
+ put_hevc_qpel_neon[3][0] = ff_hevc_put_qpel_v3_neon_8;
+ put_hevc_qpel_neon[0][1] = ff_hevc_put_qpel_h1_neon_8;
+ put_hevc_qpel_neon[0][2] = ff_hevc_put_qpel_h2_neon_8;
+ put_hevc_qpel_neon[0][3] = ff_hevc_put_qpel_h3_neon_8;
+ put_hevc_qpel_neon[1][1] = ff_hevc_put_qpel_h1v1_neon_8;
+ put_hevc_qpel_neon[1][2] = ff_hevc_put_qpel_h2v1_neon_8;
+ put_hevc_qpel_neon[1][3] = ff_hevc_put_qpel_h3v1_neon_8;
+ put_hevc_qpel_neon[2][1] = ff_hevc_put_qpel_h1v2_neon_8;
+ put_hevc_qpel_neon[2][2] = ff_hevc_put_qpel_h2v2_neon_8;
+ put_hevc_qpel_neon[2][3] = ff_hevc_put_qpel_h3v2_neon_8;
+ put_hevc_qpel_neon[3][1] = ff_hevc_put_qpel_h1v3_neon_8;
+ put_hevc_qpel_neon[3][2] = ff_hevc_put_qpel_h2v3_neon_8;
+ put_hevc_qpel_neon[3][3] = ff_hevc_put_qpel_h3v3_neon_8;
+ put_hevc_qpel_uw_neon[1][0] = ff_hevc_put_qpel_uw_v1_neon_8;
+ put_hevc_qpel_uw_neon[2][0] = ff_hevc_put_qpel_uw_v2_neon_8;
+ put_hevc_qpel_uw_neon[3][0] = ff_hevc_put_qpel_uw_v3_neon_8;
+ put_hevc_qpel_uw_neon[0][1] = ff_hevc_put_qpel_uw_h1_neon_8;
+ put_hevc_qpel_uw_neon[0][2] = ff_hevc_put_qpel_uw_h2_neon_8;
+ put_hevc_qpel_uw_neon[0][3] = ff_hevc_put_qpel_uw_h3_neon_8;
+ put_hevc_qpel_uw_neon[1][1] = ff_hevc_put_qpel_uw_h1v1_neon_8;
+ put_hevc_qpel_uw_neon[1][2] = ff_hevc_put_qpel_uw_h2v1_neon_8;
+ put_hevc_qpel_uw_neon[1][3] = ff_hevc_put_qpel_uw_h3v1_neon_8;
+ put_hevc_qpel_uw_neon[2][1] = ff_hevc_put_qpel_uw_h1v2_neon_8;
+ put_hevc_qpel_uw_neon[2][2] = ff_hevc_put_qpel_uw_h2v2_neon_8;
+ put_hevc_qpel_uw_neon[2][3] = ff_hevc_put_qpel_uw_h3v2_neon_8;
+ put_hevc_qpel_uw_neon[3][1] = ff_hevc_put_qpel_uw_h1v3_neon_8;
+ put_hevc_qpel_uw_neon[3][2] = ff_hevc_put_qpel_uw_h2v3_neon_8;
+ put_hevc_qpel_uw_neon[3][3] = ff_hevc_put_qpel_uw_h3v3_neon_8;
+ for (x = 0; x < 10; x++) {
+ c->put_hevc_qpel[x][1][0] = ff_hevc_put_qpel_neon_wrapper;
+ c->put_hevc_qpel[x][0][1] = ff_hevc_put_qpel_neon_wrapper;
+ c->put_hevc_qpel[x][1][1] = ff_hevc_put_qpel_neon_wrapper;
+ c->put_hevc_qpel_uni[x][1][0] = ff_hevc_put_qpel_uni_neon_wrapper;
+ c->put_hevc_qpel_uni[x][0][1] = ff_hevc_put_qpel_uni_neon_wrapper;
+ c->put_hevc_qpel_uni[x][1][1] = ff_hevc_put_qpel_uni_neon_wrapper;
+ c->put_hevc_qpel_bi[x][1][0] = ff_hevc_put_qpel_bi_neon_wrapper;
+ c->put_hevc_qpel_bi[x][0][1] = ff_hevc_put_qpel_bi_neon_wrapper;
+ c->put_hevc_qpel_bi[x][1][1] = ff_hevc_put_qpel_bi_neon_wrapper;
+ }
+ c->put_hevc_qpel[0][0][0] = ff_hevc_put_pixels_w2_neon_8;
+ c->put_hevc_qpel[1][0][0] = ff_hevc_put_pixels_w4_neon_8;
+ c->put_hevc_qpel[2][0][0] = ff_hevc_put_pixels_w6_neon_8;
+ c->put_hevc_qpel[3][0][0] = ff_hevc_put_pixels_w8_neon_8;
+ c->put_hevc_qpel[4][0][0] = ff_hevc_put_pixels_w12_neon_8;
+ c->put_hevc_qpel[5][0][0] = ff_hevc_put_pixels_w16_neon_8;
+ c->put_hevc_qpel[6][0][0] = ff_hevc_put_pixels_w24_neon_8;
+ c->put_hevc_qpel[7][0][0] = ff_hevc_put_pixels_w32_neon_8;
+ c->put_hevc_qpel[8][0][0] = ff_hevc_put_pixels_w48_neon_8;
+ c->put_hevc_qpel[9][0][0] = ff_hevc_put_pixels_w64_neon_8;
+
+ c->put_hevc_qpel_uni[1][0][0] = ff_hevc_put_qpel_uw_pixels_w4_neon_8;
+ c->put_hevc_qpel_uni[3][0][0] = ff_hevc_put_qpel_uw_pixels_w8_neon_8;
+ c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_qpel_uw_pixels_w16_neon_8;
+ c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_qpel_uw_pixels_w24_neon_8;
+ c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_qpel_uw_pixels_w32_neon_8;
+ c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_qpel_uw_pixels_w48_neon_8;
+ c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_qpel_uw_pixels_w64_neon_8;
+ }
+}
diff --cc libavcodec/hevc.c
index 607a8da,d5d3f59..505249e
--- a/libavcodec/hevc.c
+++ b/libavcodec/hevc.c
@@@ -1016,127 -1289,20 +1016,127 @@@ static int hls_transform_unit(HEVCConte
}
}
+ lc->tu.cross_pf = 0;
+
if (cbf_luma)
- hls_residual_coding(s, x0, y0, log2_trafo_size, scan_idx, 0);
- if (log2_trafo_size > 2) {
- if (cbf_cb)
- hls_residual_coding(s, x0, y0, log2_trafo_size - 1, scan_idx_c, 1);
- if (cbf_cr)
- hls_residual_coding(s, x0, y0, log2_trafo_size - 1, scan_idx_c, 2);
+ ff_hevc_hls_residual_coding(s, x0, y0, log2_trafo_size, scan_idx, 0);
+ if (s->ps.sps->chroma_format_idc && (log2_trafo_size > 2 || s->ps.sps->chroma_format_idc == 3)) {
+ int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]);
+ int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]);
+ lc->tu.cross_pf = (s->ps.pps->cross_component_prediction_enabled_flag && cbf_luma &&
+ (lc->cu.pred_mode == MODE_INTER ||
+ (lc->tu.chroma_mode_c == 4)));
+
+ if (lc->tu.cross_pf) {
+ hls_cross_component_pred(s, 0);
+ }
+ for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
+ if (lc->cu.pred_mode == MODE_INTRA) {
+ ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
+ s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 1);
+ }
+ if (cbf_cb[i])
+ ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
+ log2_trafo_size_c, scan_idx_c, 1);
+ else
+ if (lc->tu.cross_pf) {
+ ptrdiff_t stride = s->frame->linesize[1];
+ int hshift = s->ps.sps->hshift[1];
+ int vshift = s->ps.sps->vshift[1];
+ int16_t *coeffs_y = (int16_t*)lc->edge_emu_buffer;
+ int16_t *coeffs = (int16_t*)lc->edge_emu_buffer2;
+ int size = 1 << log2_trafo_size_c;
+
+ uint8_t *dst = &s->frame->data[1][(y0 >> vshift) * stride +
+ ((x0 >> hshift) << s->ps.sps->pixel_shift)];
+ for (i = 0; i < (size * size); i++) {
+ coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
+ }
- s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
++ s->hevcdsp.add_residual[log2_trafo_size_c-2](dst, coeffs, stride);
+ }
+ }
+
+ if (lc->tu.cross_pf) {
+ hls_cross_component_pred(s, 1);
+ }
+ for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
+ if (lc->cu.pred_mode == MODE_INTRA) {
+ ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
+ s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 2);
+ }
+ if (cbf_cr[i])
+ ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
+ log2_trafo_size_c, scan_idx_c, 2);
+ else
+ if (lc->tu.cross_pf) {
+ ptrdiff_t stride = s->frame->linesize[2];
+ int hshift = s->ps.sps->hshift[2];
+ int vshift = s->ps.sps->vshift[2];
+ int16_t *coeffs_y = (int16_t*)lc->edge_emu_buffer;
+ int16_t *coeffs = (int16_t*)lc->edge_emu_buffer2;
+ int size = 1 << log2_trafo_size_c;
+
+ uint8_t *dst = &s->frame->data[2][(y0 >> vshift) * stride +
+ ((x0 >> hshift) << s->ps.sps->pixel_shift)];
+ for (i = 0; i < (size * size); i++) {
+ coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
+ }
- s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
++ s->hevcdsp.add_residual[log2_trafo_size_c-2](dst, coeffs, stride);
+ }
+ }
+ } else if (s->ps.sps->chroma_format_idc && blk_idx == 3) {
+ int trafo_size_h = 1 << (log2_trafo_size + 1);
+ int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]);
+ for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
+ if (lc->cu.pred_mode == MODE_INTRA) {
+ ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
+ trafo_size_h, trafo_size_v);
+ s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 1);
+ }
+ if (cbf_cb[i])
+ ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
+ log2_trafo_size, scan_idx_c, 1);
+ }
+ for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
+ if (lc->cu.pred_mode == MODE_INTRA) {
+ ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
+ trafo_size_h, trafo_size_v);
+ s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 2);
+ }
+ if (cbf_cr[i])
+ ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
+ log2_trafo_size, scan_idx_c, 2);
+ }
+ }
+ } else if (s->ps.sps->chroma_format_idc && lc->cu.pred_mode == MODE_INTRA) {
+ if (log2_trafo_size > 2 || s->ps.sps->chroma_format_idc == 3) {
+ int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]);
+ int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]);
+ ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v);
+ s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 1);
+ s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 2);
+ if (s->ps.sps->chroma_format_idc == 2) {
+ ff_hevc_set_neighbour_available(s, x0, y0 + (1 << log2_trafo_size_c),
+ trafo_size_h, trafo_size_v);
+ s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 1);
+ s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 2);
+ }
} else if (blk_idx == 3) {
- if (cbf_cb)
- hls_residual_coding(s, xBase, yBase, log2_trafo_size, scan_idx_c, 1);
- if (cbf_cr)
- hls_residual_coding(s, xBase, yBase, log2_trafo_size, scan_idx_c, 2);
+ int trafo_size_h = 1 << (log2_trafo_size + 1);
+ int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]);
+ ff_hevc_set_neighbour_available(s, xBase, yBase,
+ trafo_size_h, trafo_size_v);
+ s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 1);
+ s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 2);
+ if (s->ps.sps->chroma_format_idc == 2) {
+ ff_hevc_set_neighbour_available(s, xBase, yBase + (1 << (log2_trafo_size)),
+ trafo_size_h, trafo_size_v);
+ s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 1);
+ s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 2);
+ }
}
}
+
return 0;
}
diff --cc libavcodec/hevc_cabac.c
index 05b2821,b01808f..457e087
--- a/libavcodec/hevc_cabac.c
+++ b/libavcodec/hevc_cabac.c
@@@ -999,541 -867,6 +999,541 @@@ static av_always_inline int coeff_sign_
int ret = 0;
for (i = 0; i < nb; i++)
- ret = (ret << 1) | get_cabac_bypass(&s->HEVClc.cc);
+ ret = (ret << 1) | get_cabac_bypass(&s->HEVClc->cc);
return ret;
}
+
+void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+ int log2_trafo_size, enum ScanType scan_idx,
+ int c_idx)
+{
+#define GET_COORD(offset, n) \
+ do { \
+ x_c = (x_cg << 2) + scan_x_off[n]; \
+ y_c = (y_cg << 2) + scan_y_off[n]; \
+ } while (0)
+ HEVCLocalContext *lc = s->HEVClc;
+ int transform_skip_flag = 0;
+
+ int last_significant_coeff_x, last_significant_coeff_y;
+ int last_scan_pos;
+ int n_end;
+ int num_coeff = 0;
+ int greater1_ctx = 1;
+
+ int num_last_subset;
+ int x_cg_last_sig, y_cg_last_sig;
+
+ const uint8_t *scan_x_cg, *scan_y_cg, *scan_x_off, *scan_y_off;
+
+ ptrdiff_t stride = s->frame->linesize[c_idx];
+ int hshift = s->ps.sps->hshift[c_idx];
+ int vshift = s->ps.sps->vshift[c_idx];
+ uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
+ ((x0 >> hshift) << s->ps.sps->pixel_shift)];
+ int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
+ uint8_t significant_coeff_group_flag[8][8] = {{0}};
+ int explicit_rdpcm_flag = 0;
+ int explicit_rdpcm_dir_flag;
+
+ int trafo_size = 1 << log2_trafo_size;
+ int i;
+ int qp,shift,add,scale,scale_m;
+ static const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 };
+ const uint8_t *scale_matrix = NULL;
+ uint8_t dc_scale;
+ int pred_mode_intra = (c_idx == 0) ? lc->tu.intra_pred_mode :
+ lc->tu.intra_pred_mode_c;
+
+ memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+
+ // Derive QP for dequant
+ if (!lc->cu.cu_transquant_bypass_flag) {
+ static const int qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
+ static const uint8_t rem6[51 + 4 * 6 + 1] = {
+ 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
+ 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+ 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+ 4, 5, 0, 1, 2, 3, 4, 5, 0, 1
+ };
+
+ static const uint8_t div6[51 + 4 * 6 + 1] = {
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3,
+ 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6,
+ 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10,
+ 10, 10, 11, 11, 11, 11, 11, 11, 12, 12
+ };
+ int qp_y = lc->qp_y;
+
+ if (s->ps.pps->transform_skip_enabled_flag &&
+ log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) {
+ transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx);
+ }
+
+ if (c_idx == 0) {
+ qp = qp_y + s->ps.sps->qp_bd_offset;
+ } else {
+ int qp_i, offset;
+
+ if (c_idx == 1)
+ offset = s->ps.pps->cb_qp_offset + s->sh.slice_cb_qp_offset +
+ lc->tu.cu_qp_offset_cb;
+ else
+ offset = s->ps.pps->cr_qp_offset + s->sh.slice_cr_qp_offset +
+ lc->tu.cu_qp_offset_cr;
+
+ qp_i = av_clip(qp_y + offset, - s->ps.sps->qp_bd_offset, 57);
+ if (s->ps.sps->chroma_format_idc == 1) {
+ if (qp_i < 30)
+ qp = qp_i;
+ else if (qp_i > 43)
+ qp = qp_i - 6;
+ else
+ qp = qp_c[qp_i - 30];
+ } else {
+ if (qp_i > 51)
+ qp = 51;
+ else
+ qp = qp_i;
+ }
+
+ qp += s->ps.sps->qp_bd_offset;
+ }
+
+ shift = s->ps.sps->bit_depth + log2_trafo_size - 5;
+ add = 1 << (shift-1);
+ scale = level_scale[rem6[qp]] << (div6[qp]);
+ scale_m = 16; // default when no custom scaling lists.
+ dc_scale = 16;
+
+ if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
+ const ScalingList *sl = s->ps.pps->scaling_list_data_present_flag ?
+ &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
+ int matrix_id = lc->cu.pred_mode != MODE_INTRA;
+
+ matrix_id = 3 * matrix_id + c_idx;
+
+ scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id];
+ if (log2_trafo_size >= 4)
+ dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id];
+ }
+ } else {
+ shift = 0;
+ add = 0;
+ scale = 0;
+ dc_scale = 0;
+ }
+
+ if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag &&
+ (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
+ explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx);
+ if (explicit_rdpcm_flag) {
+ explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx);
+ }
+ }
+
+ last_significant_coeff_xy_prefix_decode(s, c_idx, log2_trafo_size,
+ &last_significant_coeff_x, &last_significant_coeff_y);
+
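+    // A prefix of 0..3 is the coordinate itself; larger prefixes are
+    // completed with a suffix below:
+    // value = (1 << (prefix/2 - 1)) * (2 + (prefix & 1)) + suffix.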
+ if (last_significant_coeff_x > 3) {
+ int suffix = last_significant_coeff_suffix_decode(s, last_significant_coeff_x);
+ last_significant_coeff_x = (1 << ((last_significant_coeff_x >> 1) - 1)) *
+ (2 + (last_significant_coeff_x & 1)) +
+ suffix;
+ }
+
+ if (last_significant_coeff_y > 3) {
+ int suffix = last_significant_coeff_suffix_decode(s, last_significant_coeff_y);
+ last_significant_coeff_y = (1 << ((last_significant_coeff_y >> 1) - 1)) *
+ (2 + (last_significant_coeff_y & 1)) +
+ suffix;
+ }
+
+ if (scan_idx == SCAN_VERT)
+ FFSWAP(int, last_significant_coeff_x, last_significant_coeff_y);
+
+ x_cg_last_sig = last_significant_coeff_x >> 2;
+ y_cg_last_sig = last_significant_coeff_y >> 2;
+
+ switch (scan_idx) {
+ case SCAN_DIAG: {
+ int last_x_c = last_significant_coeff_x & 3;
+ int last_y_c = last_significant_coeff_y & 3;
+
+ scan_x_off = ff_hevc_diag_scan4x4_x;
+ scan_y_off = ff_hevc_diag_scan4x4_y;
+ num_coeff = diag_scan4x4_inv[last_y_c][last_x_c];
+ if (trafo_size == 4) {
+ scan_x_cg = scan_1x1;
+ scan_y_cg = scan_1x1;
+ } else if (trafo_size == 8) {
+ num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4;
+ scan_x_cg = diag_scan2x2_x;
+ scan_y_cg = diag_scan2x2_y;
+ } else if (trafo_size == 16) {
+ num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4;
+ scan_x_cg = ff_hevc_diag_scan4x4_x;
+ scan_y_cg = ff_hevc_diag_scan4x4_y;
+ } else { // trafo_size == 32
+ num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4;
+ scan_x_cg = ff_hevc_diag_scan8x8_x;
+ scan_y_cg = ff_hevc_diag_scan8x8_y;
+ }
+ break;
+ }
+ case SCAN_HORIZ:
+ scan_x_cg = horiz_scan2x2_x;
+ scan_y_cg = horiz_scan2x2_y;
+ scan_x_off = horiz_scan4x4_x;
+ scan_y_off = horiz_scan4x4_y;
+ num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x];
+ break;
+ default: //SCAN_VERT
+ scan_x_cg = horiz_scan2x2_y;
+ scan_y_cg = horiz_scan2x2_x;
+ scan_x_off = horiz_scan4x4_y;
+ scan_y_off = horiz_scan4x4_x;
+ num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y];
+ break;
+ }
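+    // num_coeff currently indexes the last significant coefficient in scan
+    // order; after the increment it is a count, and each 4x4 coefficient
+    // group covers 16 scan positions.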
+ num_coeff++;
+ num_last_subset = (num_coeff - 1) >> 4;
+
+ for (i = num_last_subset; i >= 0; i--) {
+ int n, m;
+ int x_cg, y_cg, x_c, y_c, pos;
+ int implicit_non_zero_coeff = 0;
+ int64_t trans_coeff_level;
+ int prev_sig = 0;
+ int offset = i << 4;
+ int rice_init = 0;
+
+ uint8_t significant_coeff_flag_idx[16];
+ uint8_t nb_significant_coeff_flag = 0;
+
+ x_cg = scan_x_cg[i];
+ y_cg = scan_y_cg[i];
+
+ if ((i < num_last_subset) && (i > 0)) {
+ int ctx_cg = 0;
+ if (x_cg < (1 << (log2_trafo_size - 2)) - 1)
+ ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg];
+ if (y_cg < (1 << (log2_trafo_size - 2)) - 1)
+ ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1];
+
+ significant_coeff_group_flag[x_cg][y_cg] =
+ significant_coeff_group_flag_decode(s, c_idx, ctx_cg);
+ implicit_non_zero_coeff = 1;
+ } else {
+ significant_coeff_group_flag[x_cg][y_cg] =
+ ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) ||
+ (x_cg == 0 && y_cg == 0));
+ }
+
+ last_scan_pos = num_coeff - offset - 1;
+
+ if (i == num_last_subset) {
+ n_end = last_scan_pos - 1;
+ significant_coeff_flag_idx[0] = last_scan_pos;
+ nb_significant_coeff_flag = 1;
+ } else {
+ n_end = 15;
+ }
+
+ if (x_cg < ((1 << log2_trafo_size) - 1) >> 2)
+ prev_sig = !!significant_coeff_group_flag[x_cg + 1][y_cg];
+ if (y_cg < ((1 << log2_trafo_size) - 1) >> 2)
+ prev_sig += (!!significant_coeff_group_flag[x_cg][y_cg + 1] << 1);
+
+ if (significant_coeff_group_flag[x_cg][y_cg] && n_end >= 0) {
+ static const uint8_t ctx_idx_map[] = {
+ 0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8, // log2_trafo_size == 2
+ 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 0
+ 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 1
+ 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, // prev_sig == 2
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 // default
+ };
+ const uint8_t *ctx_idx_map_p;
+ int scf_offset = 0;
+ if (s->ps.sps->transform_skip_context_enabled_flag &&
+ (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
+ ctx_idx_map_p = (uint8_t*) &ctx_idx_map[4 * 16];
+ if (c_idx == 0) {
+ scf_offset = 40;
+ } else {
+ scf_offset = 14 + 27;
+ }
+ } else {
+ if (c_idx != 0)
+ scf_offset = 27;
+ if (log2_trafo_size == 2) {
+ ctx_idx_map_p = (uint8_t*) &ctx_idx_map[0];
+ } else {
+ ctx_idx_map_p = (uint8_t*) &ctx_idx_map[(prev_sig + 1) << 4];
+ if (c_idx == 0) {
+ if ((x_cg > 0 || y_cg > 0))
+ scf_offset += 3;
+ if (log2_trafo_size == 3) {
+ scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15;
+ } else {
+ scf_offset += 21;
+ }
+ } else {
+ if (log2_trafo_size == 3)
+ scf_offset += 9;
+ else
+ scf_offset += 12;
+ }
+ }
+ }
+ for (n = n_end; n > 0; n--) {
+ x_c = scan_x_off[n];
+ y_c = scan_y_off[n];
+ if (significant_coeff_flag_decode(s, x_c, y_c, scf_offset, ctx_idx_map_p)) {
+ significant_coeff_flag_idx[nb_significant_coeff_flag] = n;
+ nb_significant_coeff_flag++;
+ implicit_non_zero_coeff = 0;
+ }
+ }
+ if (implicit_non_zero_coeff == 0) {
+ if (s->ps.sps->transform_skip_context_enabled_flag &&
+ (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
+ if (c_idx == 0) {
+ scf_offset = 42;
+ } else {
+ scf_offset = 16 + 27;
+ }
+ } else {
+ if (i == 0) {
+ if (c_idx == 0)
+ scf_offset = 0;
+ else
+ scf_offset = 27;
+ } else {
+ scf_offset = 2 + scf_offset;
+ }
+ }
+ if (significant_coeff_flag_decode_0(s, c_idx, scf_offset) == 1) {
+ significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
+ nb_significant_coeff_flag++;
+ }
+ } else {
+ significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
+ nb_significant_coeff_flag++;
+ }
+ }
+
+ n_end = nb_significant_coeff_flag;
+
+
+ if (n_end) {
+ int first_nz_pos_in_cg;
+ int last_nz_pos_in_cg;
+ int c_rice_param = 0;
+ int first_greater1_coeff_idx = -1;
+ uint8_t coeff_abs_level_greater1_flag[8];
+ uint16_t coeff_sign_flag;
+ int sum_abs = 0;
+ int sign_hidden;
+ int sb_type;
+
+
+            // initialize the first element of coeff_abs_level_greater1_flag
+ int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0;
+
+ if (s->ps.sps->persistent_rice_adaptation_enabled_flag) {
+ if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag)
+ sb_type = 2 * (c_idx == 0 ? 1 : 0);
+ else
+ sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1;
+ c_rice_param = lc->stat_coeff[sb_type] / 4;
+ }
+
+ if (!(i == num_last_subset) && greater1_ctx == 0)
+ ctx_set++;
+ greater1_ctx = 1;
+ last_nz_pos_in_cg = significant_coeff_flag_idx[0];
+
+ for (m = 0; m < (n_end > 8 ? 8 : n_end); m++) {
+ int inc = (ctx_set << 2) + greater1_ctx;
+ coeff_abs_level_greater1_flag[m] =
+ coeff_abs_level_greater1_flag_decode(s, c_idx, inc);
+ if (coeff_abs_level_greater1_flag[m]) {
+ greater1_ctx = 0;
+ if (first_greater1_coeff_idx == -1)
+ first_greater1_coeff_idx = m;
+ } else if (greater1_ctx > 0 && greater1_ctx < 3) {
+ greater1_ctx++;
+ }
+ }
+ first_nz_pos_in_cg = significant_coeff_flag_idx[n_end - 1];
+
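+            // Sign data hiding: disabled for the lossless/RDPCM paths below;
+            // otherwise the sign of the first nonzero coefficient in scan
+            // order is not coded when the first and last nonzero positions in
+            // the group are at least 4 apart, and is recovered later from the
+            // parity of the group's absolute sum.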
+ if (lc->cu.cu_transquant_bypass_flag ||
+ (lc->cu.pred_mode == MODE_INTRA &&
+ s->ps.sps->implicit_rdpcm_enabled_flag && transform_skip_flag &&
+             (pred_mode_intra == 10 || pred_mode_intra == 26)) ||
+ explicit_rdpcm_flag)
+ sign_hidden = 0;
+ else
+ sign_hidden = (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4);
+
+ if (first_greater1_coeff_idx != -1) {
+ coeff_abs_level_greater1_flag[first_greater1_coeff_idx] += coeff_abs_level_greater2_flag_decode(s, c_idx, ctx_set);
+ }
+            if (!s->ps.pps->sign_data_hiding_flag || !sign_hidden) {
+ coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag) << (16 - nb_significant_coeff_flag);
+ } else {
+ coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag - 1) << (16 - (nb_significant_coeff_flag - 1));
+ }
+
+ for (m = 0; m < n_end; m++) {
+ n = significant_coeff_flag_idx[m];
+ GET_COORD(offset, n);
+ if (m < 8) {
+ trans_coeff_level = 1 + coeff_abs_level_greater1_flag[m];
+ if (trans_coeff_level == ((m == first_greater1_coeff_idx) ? 3 : 2)) {
+ int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
+
+ trans_coeff_level += last_coeff_abs_level_remaining;
+ if (trans_coeff_level > (3 << c_rice_param))
+ c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
+ if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
+ int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
+ if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
+ lc->stat_coeff[sb_type]++;
+ else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
+ if (lc->stat_coeff[sb_type] > 0)
+ lc->stat_coeff[sb_type]--;
+ rice_init = 1;
+ }
+ }
+ } else {
+ int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
+
+ trans_coeff_level = 1 + last_coeff_abs_level_remaining;
+ if (trans_coeff_level > (3 << c_rice_param))
+ c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
+ if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
+ int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
+ if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
+ lc->stat_coeff[sb_type]++;
+ else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
+ if (lc->stat_coeff[sb_type] > 0)
+ lc->stat_coeff[sb_type]--;
+ rice_init = 1;
+ }
+ }
+ if (s->ps.pps->sign_data_hiding_flag && sign_hidden) {
+ sum_abs += trans_coeff_level;
+ if (n == first_nz_pos_in_cg && (sum_abs&1))
+ trans_coeff_level = -trans_coeff_level;
+ }
+ if (coeff_sign_flag >> 15)
+ trans_coeff_level = -trans_coeff_level;
+ coeff_sign_flag <<= 1;
+ if(!lc->cu.cu_transquant_bypass_flag) {
+ if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
+ if(y_c || x_c || log2_trafo_size < 4) {
+ switch(log2_trafo_size) {
+ case 3: pos = (y_c << 3) + x_c; break;
+ case 4: pos = ((y_c >> 1) << 3) + (x_c >> 1); break;
+ case 5: pos = ((y_c >> 2) << 3) + (x_c >> 2); break;
+ default: pos = (y_c << 2) + x_c; break;
+ }
+ scale_m = scale_matrix[pos];
+ } else {
+ scale_m = dc_scale;
+ }
+ }
+ trans_coeff_level = (trans_coeff_level * (int64_t)scale * (int64_t)scale_m + add) >> shift;
+ if(trans_coeff_level < 0) {
+                    if((~trans_coeff_level) & 0xffffffffffff8000)
+ trans_coeff_level = -32768;
+ } else {
+ if(trans_coeff_level & 0xffffffffffff8000)
+ trans_coeff_level = 32767;
+ }
+ }
+ coeffs[y_c * trafo_size + x_c] = trans_coeff_level;
+ }
+ }
+ }
+
+ if (lc->cu.cu_transquant_bypass_flag) {
+ if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
+ (pred_mode_intra == 10 || pred_mode_intra == 26))) {
+ int mode = s->ps.sps->implicit_rdpcm_enabled_flag ? (pred_mode_intra == 26) : explicit_rdpcm_dir_flag;
+
+ s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
+ }
+ } else {
+ if (transform_skip_flag) {
+ int rot = s->ps.sps->transform_skip_rotation_enabled_flag &&
+ log2_trafo_size == 2 &&
+ lc->cu.pred_mode == MODE_INTRA;
+ if (rot) {
+ for (i = 0; i < 8; i++)
+ FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]);
+ }
+
- s->hevcdsp.transform_skip(coeffs, log2_trafo_size);
++ s->hevcdsp.dequant(coeffs, log2_trafo_size);
+
+ if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
+ lc->cu.pred_mode == MODE_INTRA &&
+ (pred_mode_intra == 10 || pred_mode_intra == 26))) {
+ int mode = explicit_rdpcm_flag ? explicit_rdpcm_dir_flag : (pred_mode_intra == 26);
+
+ s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
+ }
+ } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
- s->hevcdsp.idct_4x4_luma(coeffs);
++ s->hevcdsp.transform_4x4_luma(coeffs);
+ } else {
+ int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
+ if (max_xy == 0)
+ s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
+ else {
+ int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
+ if (max_xy < 4)
+ col_limit = FFMIN(4, col_limit);
+ else if (max_xy < 8)
+ col_limit = FFMIN(8, col_limit);
+ else if (max_xy < 12)
+ col_limit = FFMIN(24, col_limit);
+ s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
+ }
+ }
+ }
+ if (lc->tu.cross_pf) {
+ int16_t *coeffs_y = (int16_t*)lc->edge_emu_buffer;
+
+ for (i = 0; i < (trafo_size * trafo_size); i++) {
+ coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
+ }
+ }
- s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride);
++ s->hevcdsp.add_residual[log2_trafo_size-2](dst, coeffs, stride);
+}
+
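For reference, the per-level scaling in the loop above reduces to the
following scalar form; a minimal sketch (function name hypothetical), showing
the multiply by the QP-derived scale and the scaling-list factor, the rounding
shift, and the clamp to the int16 range of the coeffs buffer:

    #include <stdint.h>

    /* Sketch of the dequant arithmetic applied to each decoded level. */
    static int16_t dequant_level(int level, int scale, int scale_m,
                                 int add, int shift)
    {
        int64_t v = ((int64_t)level * scale * scale_m + add) >> shift;
        if (v < -32768)
            v = -32768;
        else if (v > 32767)
            v = 32767;
        return (int16_t)v;
    }
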
+void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size)
+{
+ HEVCLocalContext *lc = s->HEVClc;
+ int x = abs_mvd_greater0_flag_decode(s);
+ int y = abs_mvd_greater0_flag_decode(s);
+
+ if (x)
+ x += abs_mvd_greater1_flag_decode(s);
+ if (y)
+ y += abs_mvd_greater1_flag_decode(s);
+
+ switch (x) {
+ case 2: lc->pu.mvd.x = mvd_decode(s); break;
+ case 1: lc->pu.mvd.x = mvd_sign_flag_decode(s); break;
+ case 0: lc->pu.mvd.x = 0; break;
+ }
+
+ switch (y) {
+ case 2: lc->pu.mvd.y = mvd_decode(s); break;
+ case 1: lc->pu.mvd.y = mvd_sign_flag_decode(s); break;
+ case 0: lc->pu.mvd.y = 0; break;
+ }
+}
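
The switch statements above map the two greater-flags onto three outcomes per
component; a sketch, where the remainder and sign arguments are hypothetical
stand-ins for the bypass reads done by mvd_decode() and
mvd_sign_flag_decode():

    /* Sketch: |mvd| as selected by the greater0/greater1 flags. */
    static int mvd_component(int greater0, int greater1,
                             int remainder, int negative)
    {
        int abs_mvd;
        if (!greater0)
            return 0;                      /* case 0 */
        abs_mvd = greater1 ? 2 + remainder /* case 2 */
                           : 1;            /* case 1 */
        return negative ? -abs_mvd : abs_mvd;
    }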
+
diff --cc libavcodec/hevcdsp.c
index 9d773d9,4bd3d97..23e923f
--- a/libavcodec/hevcdsp.c
+++ b/libavcodec/hevcdsp.c
@@@ -195,13 -164,12 +195,13 @@@ void ff_hevc_dsp_init(HEVCDSPContext *h
#define HEVC_DSP(depth) \
hevcdsp->put_pcm = FUNC(put_pcm, depth); \
- hevcdsp->transform_add[0] = FUNC(transform_add4x4, depth); \
- hevcdsp->transform_add[1] = FUNC(transform_add8x8, depth); \
- hevcdsp->transform_add[2] = FUNC(transform_add16x16, depth); \
- hevcdsp->transform_add[3] = FUNC(transform_add32x32, depth); \
- hevcdsp->transform_skip = FUNC(transform_skip, depth); \
+ hevcdsp->add_residual[0] = FUNC(add_residual4x4, depth); \
+ hevcdsp->add_residual[1] = FUNC(add_residual8x8, depth); \
+ hevcdsp->add_residual[2] = FUNC(add_residual16x16, depth); \
+ hevcdsp->add_residual[3] = FUNC(add_residual32x32, depth); \
+ hevcdsp->dequant = FUNC(dequant, depth); \
+ hevcdsp->transform_rdpcm = FUNC(transform_rdpcm, depth); \
- hevcdsp->idct_4x4_luma = FUNC(transform_4x4_luma, depth); \
+ hevcdsp->transform_4x4_luma = FUNC(transform_4x4_luma, depth); \
hevcdsp->idct[0] = FUNC(idct_4x4, depth); \
hevcdsp->idct[1] = FUNC(idct_8x8, depth); \
hevcdsp->idct[2] = FUNC(idct_16x16, depth); \
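
Each right-hand side in the macro above resolves to a per-bit-depth symbol;
assuming the usual token-pasting helper (a sketch, not the verbatim
definition from hevcdsp.c):

    /* FUNC(add_residual4x4, 8) -> add_residual4x4_8, i.e. the function
     * generated from hevcdsp_template.c compiled with BIT_DEPTH == 8. */
    #define FUNC2(name, depth) name ## _ ## depth
    #define FUNC(name, depth)  FUNC2(name, depth)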
diff --cc libavcodec/hevcdsp.h
index 9f1f6dd,decd1c9..3b7e737
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@@ -43,82 -39,76 +43,82 @@@ typedef struct SAOParams
} SAOParams;
typedef struct HEVCDSPContext {
- void (*put_pcm)(uint8_t *dst, ptrdiff_t stride, int size,
- GetBitContext *gb, int pcm_bit_depth);
+ void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
+ struct GetBitContext *gb, int pcm_bit_depth);
+
- void (*transform_add[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride);
++ void (*add_residual[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride);
+
- void (*transform_skip)(int16_t *coeffs, int16_t log2_size);
++ void (*dequant)(int16_t *coeffs, int16_t log2_size);
- void (*add_residual[4])(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+ void (*transform_rdpcm)(int16_t *coeffs, int16_t log2_size, int mode);
- void (*idct_4x4_luma)(int16_t *coeffs);
- void (*dequant)(int16_t *coeffs);
+ void (*transform_4x4_luma)(int16_t *coeffs);
- void (*idct[4])(int16_t *coeffs);
-
- void (*sao_band_filter[4])(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
- struct SAOParams *sao, int *borders,
- int width, int height, int c_idx);
- void (*sao_edge_filter[4])(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
- struct SAOParams *sao, int *borders, int width,
- int height, int c_idx, uint8_t vert_edge,
- uint8_t horiz_edge, uint8_t diag_edge);
-
- void (*put_hevc_qpel[2][2][8])(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
- ptrdiff_t srcstride, int height,
- int mx, int my, int16_t *mcbuffer);
- void (*put_hevc_epel[2][2][8])(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
- ptrdiff_t srcstride, int height,
- int mx, int my, int16_t *mcbuffer);
-
- void (*put_unweighted_pred[8])(uint8_t *dst, ptrdiff_t dststride, int16_t *src,
- ptrdiff_t srcstride, int height);
- void (*put_unweighted_pred_chroma[8])(uint8_t *dst, ptrdiff_t dststride, int16_t *src,
- ptrdiff_t srcstride, int height);
- void (*put_unweighted_pred_avg[8])(uint8_t *dst, ptrdiff_t dststride,
- int16_t *src1, int16_t *src2,
- ptrdiff_t srcstride, int height);
- void (*put_unweighted_pred_avg_chroma[8])(uint8_t *dst, ptrdiff_t dststride,
- int16_t *src1, int16_t *src2,
- ptrdiff_t srcstride, int height);
- void (*weighted_pred[8])(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
- uint8_t *dst, ptrdiff_t dststride, int16_t *src,
- ptrdiff_t srcstride, int height);
- void (*weighted_pred_chroma[8])(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
- uint8_t *dst, ptrdiff_t dststride, int16_t *src,
- ptrdiff_t srcstride, int height);
- void (*weighted_pred_avg[8])(uint8_t denom, int16_t wl0Flag, int16_t wl1Flag,
- int16_t ol0Flag, int16_t ol1Flag, uint8_t *dst,
- ptrdiff_t dststride, int16_t *src1, int16_t *src2,
- ptrdiff_t srcstride, int height);
- void (*weighted_pred_avg_chroma[8])(uint8_t denom, int16_t wl0Flag, int16_t wl1Flag,
- int16_t ol0Flag, int16_t ol1Flag, uint8_t *dst,
- ptrdiff_t dststride, int16_t *src1, int16_t *src2,
- ptrdiff_t srcstride, int height);
+
+ void (*idct[4])(int16_t *coeffs, int col_limit);
+
+ void (*idct_dc[4])(int16_t *coeffs);
+
+ void (*sao_band_filter[5])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+
+    /* the implicit stride_src parameter has the value 2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE */
+ void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
+ int16_t *sao_offset_val, int sao_eo_class, int width, int height);
+
+ void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+ struct SAOParams *sao, int *borders, int _width, int _height, int c_idx,
+ uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
+
+ void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width);
+ void (*put_hevc_qpel_uni[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width);
+ void (*put_hevc_qpel_uni_w[10][2][2])(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width);
+
+ void (*put_hevc_qpel_bi[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width);
+ void (*put_hevc_qpel_bi_w[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int16_t *src2,
+ int height, int denom, int wx0, int wx1,
+ int ox0, int ox1, intptr_t mx, intptr_t my, int width);
+ void (*put_hevc_epel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width);
+
+ void (*put_hevc_epel_uni[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int height, intptr_t mx, intptr_t my, int width);
+ void (*put_hevc_epel_uni_w[10][2][2])(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width);
+ void (*put_hevc_epel_bi[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width);
+ void (*put_hevc_epel_bi_w[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int16_t *src2,
+ int height, int denom, int wx0, int ox0, int wx1,
+ int ox1, intptr_t mx, intptr_t my, int width);
void (*hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
- int beta, int *tc,
+ int beta, int32_t *tc,
uint8_t *no_p, uint8_t *no_q);
void (*hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
- int beta, int *tc,
+ int beta, int32_t *tc,
uint8_t *no_p, uint8_t *no_q);
void (*hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
- int *tc, uint8_t *no_p, uint8_t *no_q);
+ int32_t *tc, uint8_t *no_p, uint8_t *no_q);
void (*hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
- int *tc, uint8_t *no_p, uint8_t *no_q);
+ int32_t *tc, uint8_t *no_p, uint8_t *no_q);
void (*hevc_h_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride,
- int beta, int *tc,
+ int beta, int32_t *tc,
uint8_t *no_p, uint8_t *no_q);
void (*hevc_v_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride,
- int beta, int *tc,
+ int beta, int32_t *tc,
uint8_t *no_p, uint8_t *no_q);
void (*hevc_h_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
- int *tc, uint8_t *no_p,
+ int32_t *tc, uint8_t *no_p,
uint8_t *no_q);
void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
- int *tc, uint8_t *no_p,
+ int32_t *tc, uint8_t *no_p,
uint8_t *no_q);
} HEVCDSPContext;
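
The size-indexed tables are dispatched with log2_trafo_size - 2, matching the
call sites in hevc_cabac.c above; a usage sketch (helper name hypothetical,
assuming hevcdsp.h is included):

    /* Index 0..3 selects the 4x4 .. 32x32 variant. */
    static void reconstruct_block(HEVCDSPContext *h, uint8_t *dst,
                                  int16_t *res, ptrdiff_t stride,
                                  int log2_trafo_size, int col_limit)
    {
        h->idct[log2_trafo_size - 2](res, col_limit);
        h->add_residual[log2_trafo_size - 2](dst, res, stride);
    }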
diff --cc libavcodec/hevcdsp_template.c
index 78eb354,b4816db..66b1ac0
--- a/libavcodec/hevcdsp_template.c
+++ b/libavcodec/hevcdsp_template.c
@@@ -59,76 -57,43 +59,74 @@@ static av_always_inline void FUNC(add_r
}
}
- static void FUNC(transform_add4x4)(uint8_t *_dst, int16_t *coeffs,
- ptrdiff_t stride)
+ static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *res,
+ ptrdiff_t stride)
{
- FUNC(transquant_bypass)(_dst, coeffs, stride, 4);
+ FUNC(add_residual)(_dst, res, stride, 4);
}
- static void FUNC(transform_add8x8)(uint8_t *_dst, int16_t *coeffs,
- ptrdiff_t stride)
+ static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *res,
+ ptrdiff_t stride)
{
- FUNC(transquant_bypass)(_dst, coeffs, stride, 8);
+ FUNC(add_residual)(_dst, res, stride, 8);
}
- static void FUNC(transform_add16x16)(uint8_t *_dst, int16_t *coeffs,
- ptrdiff_t stride)
+ static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *res,
+ ptrdiff_t stride)
{
- FUNC(transquant_bypass)(_dst, coeffs, stride, 16);
+ FUNC(add_residual)(_dst, res, stride, 16);
}
- static void FUNC(transform_add32x32)(uint8_t *_dst, int16_t *coeffs,
- ptrdiff_t stride)
+ static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res,
+ ptrdiff_t stride)
{
- FUNC(transquant_bypass)(_dst, coeffs, stride, 32);
+ FUNC(add_residual)(_dst, res, stride, 32);
}
-static void FUNC(dequant)(int16_t *coeffs)
+
+static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode)
{
- int shift = 13 - BIT_DEPTH;
-#if BIT_DEPTH <= 13
- int offset = 1 << (shift - 1);
-#else
- int offset = 0;
-#endif
+ int16_t *coeffs = (int16_t *) _coeffs;
int x, y;
+ int size = 1 << log2_size;
+
+ if (mode) {
+ coeffs += size;
+ for (y = 0; y < size - 1; y++) {
+ for (x = 0; x < size; x++)
+ coeffs[x] += coeffs[x - size];
+ coeffs += size;
+ }
+ } else {
+ for (y = 0; y < size; y++) {
+ for (x = 1; x < size; x++)
+ coeffs[x] += coeffs[x - 1];
+ coeffs += size;
+ }
+ }
+}
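
transform_rdpcm() turns the residual into running sums along rows (mode == 0)
or down columns (mode != 0). A worked instance of the horizontal branch on a
single 4-sample row:

    #include <stdio.h>
    #include <stdint.h>

    /* Each sample accumulates its left neighbour, as in the mode == 0 loop. */
    int main(void)
    {
        int16_t row[4] = { 1, 2, -1, 0 };
        int x;
        for (x = 1; x < 4; x++)
            row[x] += row[x - 1];
        printf("%d %d %d %d\n", row[0], row[1], row[2], row[3]); /* 1 3 2 2 */
        return 0;
    }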
- static void FUNC(transform_skip)(int16_t *_coeffs, int16_t log2_size)
- for (y = 0; y < 4 * 4; y += 4) {
- for (x = 0; x < 4; x++)
- coeffs[y + x] = (coeffs[y + x] + offset) >> shift;
++static void FUNC(dequant)(int16_t *coeffs, int16_t log2_size)
+{
+ int shift = 15 - BIT_DEPTH - log2_size;
+ int x, y;
+ int size = 1 << log2_size;
- int16_t *coeffs = _coeffs;
-
+
+ if (shift > 0) {
+ int offset = 1 << (shift - 1);
+ for (y = 0; y < size; y++) {
+ for (x = 0; x < size; x++) {
+ *coeffs = (*coeffs + offset) >> shift;
+ coeffs++;
+ }
+ }
+ } else {
+ for (y = 0; y < size; y++) {
+ for (x = 0; x < size; x++) {
+ *coeffs = *coeffs << -shift;
+ coeffs++;
+ }
+ }
}
}
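
The rescale shift in the new dequant() is 15 - BIT_DEPTH - log2_size, so it
shrinks with bit depth and block size and can reach zero or go negative, which
is why a left-shift branch exists; a few instances (helper name hypothetical):

    /* dequant_shift(8, 2)  ==  5  -> rounded right shift  (8-bit, 4x4)
     * dequant_shift(10, 5) ==  0  -> left shift by 0, a no-op (10-bit, 32x32)
     * dequant_shift(12, 5) == -2  -> left shift by 2      (12-bit, 32x32) */
    static int dequant_shift(int bit_depth, int log2_size)
    {
        return 15 - bit_depth - log2_size;
    }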
@@@ -174,137 -137,157 +170,136 @@@ static void FUNC(transform_4x4_luma)(in
#undef TR_4x4_LUMA
-#define TR_4(dst, src, dstep, sstep, assign) \
- do { \
- const int e0 = transform[8 * 0][0] * src[0 * sstep] + \
- transform[8 * 2][0] * src[2 * sstep]; \
- const int e1 = transform[8 * 0][1] * src[0 * sstep] + \
- transform[8 * 2][1] * src[2 * sstep]; \
- const int o0 = transform[8 * 1][0] * src[1 * sstep] + \
- transform[8 * 3][0] * src[3 * sstep]; \
- const int o1 = transform[8 * 1][1] * src[1 * sstep] + \
- transform[8 * 3][1] * src[3 * sstep]; \
- \
- assign(dst[0 * dstep], e0 + o0); \
- assign(dst[1 * dstep], e1 + o1); \
- assign(dst[2 * dstep], e1 - o1); \
- assign(dst[3 * dstep], e0 - o0); \
+#define TR_4(dst, src, dstep, sstep, assign, end) \
+ do { \
+ const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep]; \
+ const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep]; \
+ const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep]; \
+ const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep]; \
+ \
+ assign(dst[0 * dstep], e0 + o0); \
+ assign(dst[1 * dstep], e1 + o1); \
+ assign(dst[2 * dstep], e1 - o1); \
+ assign(dst[3 * dstep], e0 - o0); \
} while (0)
-static void FUNC(idct_4x4)(int16_t *coeffs)
-{
- int i;
- int shift = 7;
- int add = 1 << (shift - 1);
- int16_t *src = coeffs;
-
- for (i = 0; i < 4; i++) {
- TR_4(src, src, 4, 4, SCALE);
- src++;
- }
-
- shift = 20 - BIT_DEPTH;
- add = 1 << (shift - 1);
- for (i = 0; i < 4; i++) {
- TR_4(coeffs, coeffs, 1, 1, SCALE);
- coeffs += 4;
- }
-}
-
-#define TR_8(dst, src, dstep, sstep, assign) \
- do { \
- int i, j; \
- int e_8[4]; \
- int o_8[4] = { 0 }; \
- for (i = 0; i < 4; i++) \
- for (j = 1; j < 8; j += 2) \
- o_8[i] += transform[4 * j][i] * src[j * sstep]; \
- TR_4(e_8, src, 1, 2 * sstep, SET); \
- \
- for (i = 0; i < 4; i++) { \
- assign(dst[i * dstep], e_8[i] + o_8[i]); \
- assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]); \
- } \
+#define TR_8(dst, src, dstep, sstep, assign, end) \
+ do { \
+ int i, j; \
+ int e_8[4]; \
+ int o_8[4] = { 0 }; \
+ for (i = 0; i < 4; i++) \
+ for (j = 1; j < end; j += 2) \
+ o_8[i] += transform[4 * j][i] * src[j * sstep]; \
+ TR_4(e_8, src, 1, 2 * sstep, SET, 4); \
+ \
+ for (i = 0; i < 4; i++) { \
+ assign(dst[i * dstep], e_8[i] + o_8[i]); \
+ assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]); \
+ } \
} while (0)
-#define TR_16(dst, src, dstep, sstep, assign) \
- do { \
- int i, j; \
- int e_16[8]; \
- int o_16[8] = { 0 }; \
- for (i = 0; i < 8; i++) \
- for (j = 1; j < 16; j += 2) \
- o_16[i] += transform[2 * j][i] * src[j * sstep]; \
- TR_8(e_16, src, 1, 2 * sstep, SET); \
- \
- for (i = 0; i < 8; i++) { \
- assign(dst[i * dstep], e_16[i] + o_16[i]); \
- assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]); \
- } \
+#define TR_16(dst, src, dstep, sstep, assign, end) \
+ do { \
+ int i, j; \
+ int e_16[8]; \
+ int o_16[8] = { 0 }; \
+ for (i = 0; i < 8; i++) \
+ for (j = 1; j < end; j += 2) \
+ o_16[i] += transform[2 * j][i] * src[j * sstep]; \
+ TR_8(e_16, src, 1, 2 * sstep, SET, 8); \
+ \
+ for (i = 0; i < 8; i++) { \
+ assign(dst[i * dstep], e_16[i] + o_16[i]); \
+ assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]); \
+ } \
} while (0)
-#define TR_32(dst, src, dstep, sstep, assign) \
- do { \
- int i, j; \
- int e_32[16]; \
- int o_32[16] = { 0 }; \
- for (i = 0; i < 16; i++) \
- for (j = 1; j < 32; j += 2) \
- o_32[i] += transform[j][i] * src[j * sstep]; \
- TR_16(e_32, src, 1, 2 * sstep, SET); \
- \
- for (i = 0; i < 16; i++) { \
- assign(dst[i * dstep], e_32[i] + o_32[i]); \
- assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]); \
- } \
+#define TR_32(dst, src, dstep, sstep, assign, end) \
+ do { \
+ int i, j; \
+ int e_32[16]; \
+ int o_32[16] = { 0 }; \
+ for (i = 0; i < 16; i++) \
+ for (j = 1; j < end; j += 2) \
+ o_32[i] += transform[j][i] * src[j * sstep]; \
+ TR_16(e_32, src, 1, 2 * sstep, SET, end/2); \
+ \
+ for (i = 0; i < 16; i++) { \
+ assign(dst[i * dstep], e_32[i] + o_32[i]); \
+ assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]); \
+ } \
} while (0)
-
-
-static void FUNC(idct_8x8)(int16_t *coeffs)
-{
- int i;
- int shift = 7;
- int add = 1 << (shift - 1);
- int16_t *src = coeffs;
-
- for (i = 0; i < 8; i++) {
- TR_8(src, src, 8, 8, SCALE);
- src++;
- }
-
- shift = 20 - BIT_DEPTH;
- add = 1 << (shift - 1);
- for (i = 0; i < 8; i++) {
- TR_8(coeffs, coeffs, 1, 1, SCALE);
- coeffs += 8;
- }
+#define IDCT_VAR4(H) \
+ int limit2 = FFMIN(col_limit + 4, H)
+#define IDCT_VAR8(H) \
+ int limit = FFMIN(col_limit, H); \
+ int limit2 = FFMIN(col_limit + 4, H)
+#define IDCT_VAR16(H) IDCT_VAR8(H)
+#define IDCT_VAR32(H) IDCT_VAR8(H)
+
+#define IDCT(H) \
+static void FUNC(idct_##H ##x ##H )( \
+ int16_t *coeffs, int col_limit) { \
+ int i; \
+ int shift = 7; \
+ int add = 1 << (shift - 1); \
+ int16_t *src = coeffs; \
+ IDCT_VAR ##H(H); \
+ \
+ for (i = 0; i < H; i++) { \
+ TR_ ## H(src, src, H, H, SCALE, limit2); \
+ if (limit2 < H && i%4 == 0 && !!i) \
+ limit2 -= 4; \
+ src++; \
+ } \
+ \
+ shift = 20 - BIT_DEPTH; \
+ add = 1 << (shift - 1); \
+ for (i = 0; i < H; i++) { \
+ TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit); \
+ coeffs += H; \
+ } \
}
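
The new col_limit argument lets the transform passes skip high-frequency taps
that are known to be zero. The caller derives it from the last significant
coefficient position (the all-DC case goes to idct_dc instead); a sketch of
that derivation, mirroring the residual coding function above:

    /* col_limit = last_x + last_y + 4, clamped in steps tied to how far out
     * the last significant coefficient lies. */
    static int idct_col_limit(int last_x, int last_y)
    {
        int max_xy    = last_x > last_y ? last_x : last_y;
        int col_limit = last_x + last_y + 4;
        if (max_xy < 4)
            return col_limit < 4 ? col_limit : 4;
        if (max_xy < 8)
            return col_limit < 8 ? col_limit : 8;
        if (max_xy < 12)
            return col_limit < 24 ? col_limit : 24;
        return col_limit;
    }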
-static void FUNC(idct_16x16)(int16_t *coeffs)
-{
- int i;
- int shift = 7;
- int add = 1 << (shift - 1);
- int16_t *src = coeffs;
+#define IDCT_DC(H) \
+static void FUNC(idct_##H ##x ##H ##_dc)( \
+ int16_t *coeffs) { \
+ int i, j; \
+ int shift = 14 - BIT_DEPTH; \
+ int add = 1 << (shift - 1); \
+ int coeff = (((coeffs[0] + 1) >> 1) + add) >> shift; \
+ \
+ for (j = 0; j < H; j++) { \
+ for (i = 0; i < H; i++) { \
+ coeffs[i+j*H] = coeff; \
+ } \
+ } \
+}
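
IDCT_DC fills the whole block with one value derived from the DC coefficient.
A worked instance at 8-bit depth, where shift = 14 - 8 = 6 and add = 32:

    /* idct_dc_value_8(64) == (((64 + 1) >> 1) + 32) >> 6 == 1, so a DC
     * coefficient of 64 yields a flat residual of 1. */
    static int idct_dc_value_8(int dc)
    {
        return (((dc + 1) >> 1) + 32) >> 6;
    }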
- for (i = 0; i < 16; i++) {
- TR_16(src, src, 16, 16, SCALE);
- src++;
- }
+IDCT( 4)
+IDCT( 8)
+IDCT(16)
+IDCT(32)
- shift = 20 - BIT_DEPTH;
- add = 1 << (shift - 1);
- for (i = 0; i < 16; i++) {
- TR_16(coeffs, coeffs, 1, 1, SCALE);
- coeffs += 16;
- }
-}
+IDCT_DC( 4)
+IDCT_DC( 8)
+IDCT_DC(16)
+IDCT_DC(32)
-static void FUNC(idct_32x32)(int16_t *coeffs)
-{
- int i;
- int shift = 7;
- int add = 1 << (shift - 1);
- int16_t *src = coeffs;
+#undef TR_4
+#undef TR_8
+#undef TR_16
+#undef TR_32
- for (i = 0; i < 32; i++) {
- TR_32(src, src, 32, 32, SCALE);
- src++;
- }
- src = coeffs;
- shift = 20 - BIT_DEPTH;
- add = 1 << (shift - 1);
- for (i = 0; i < 32; i++) {
- TR_32(coeffs, coeffs, 1, 1, SCALE);
- coeffs += 32;
- }
-}
+#undef SET
+#undef SCALE
- #undef ADD_AND_SCALE
static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src,
- ptrdiff_t stride, SAOParams *sao,
- int *borders, int width, int height,
- int c_idx, int class)
+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ int16_t *sao_offset_val, int sao_left_class,
+ int width, int height)
{
pixel *dst = (pixel *)_dst;
pixel *src = (pixel *)_src;
diff --cc libavcodec/mips/hevcdsp_init_mips.c
index 3675b93,0000000..776d13e
mode 100644,000000..100644
--- a/libavcodec/mips/hevcdsp_init_mips.c
+++ b/libavcodec/mips/hevcdsp_init_mips.c
@@@ -1,454 -1,0 +1,454 @@@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale at imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/mips/hevcdsp_mips.h"
+
+#if HAVE_MSA
+static av_cold void hevc_dsp_init_msa(HEVCDSPContext *c,
+ const int bit_depth)
+{
+ if (8 == bit_depth) {
+ c->put_hevc_qpel[1][0][0] = ff_hevc_put_hevc_pel_pixels4_8_msa;
+ c->put_hevc_qpel[2][0][0] = ff_hevc_put_hevc_pel_pixels6_8_msa;
+ c->put_hevc_qpel[3][0][0] = ff_hevc_put_hevc_pel_pixels8_8_msa;
+ c->put_hevc_qpel[4][0][0] = ff_hevc_put_hevc_pel_pixels12_8_msa;
+ c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_8_msa;
+ c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_8_msa;
+ c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_msa;
+ c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_msa;
+ c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_msa;
+
+ c->put_hevc_qpel[1][0][1] = ff_hevc_put_hevc_qpel_h4_8_msa;
+ c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_msa;
+ c->put_hevc_qpel[4][0][1] = ff_hevc_put_hevc_qpel_h12_8_msa;
+ c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_8_msa;
+ c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_8_msa;
+ c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_msa;
+ c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_msa;
+ c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_msa;
+
+ c->put_hevc_qpel[1][1][0] = ff_hevc_put_hevc_qpel_v4_8_msa;
+ c->put_hevc_qpel[3][1][0] = ff_hevc_put_hevc_qpel_v8_8_msa;
+ c->put_hevc_qpel[4][1][0] = ff_hevc_put_hevc_qpel_v12_8_msa;
+ c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_8_msa;
+ c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_8_msa;
+ c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_msa;
+ c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_msa;
+ c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_msa;
+
+ c->put_hevc_qpel[1][1][1] = ff_hevc_put_hevc_qpel_hv4_8_msa;
+ c->put_hevc_qpel[3][1][1] = ff_hevc_put_hevc_qpel_hv8_8_msa;
+ c->put_hevc_qpel[4][1][1] = ff_hevc_put_hevc_qpel_hv12_8_msa;
+ c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_8_msa;
+ c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_8_msa;
+ c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_8_msa;
+ c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_8_msa;
+ c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_8_msa;
+
+ c->put_hevc_epel[1][0][0] = ff_hevc_put_hevc_pel_pixels4_8_msa;
+ c->put_hevc_epel[2][0][0] = ff_hevc_put_hevc_pel_pixels6_8_msa;
+ c->put_hevc_epel[3][0][0] = ff_hevc_put_hevc_pel_pixels8_8_msa;
+ c->put_hevc_epel[4][0][0] = ff_hevc_put_hevc_pel_pixels12_8_msa;
+ c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_8_msa;
+ c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_8_msa;
+ c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_msa;
+
+ c->put_hevc_epel[1][0][1] = ff_hevc_put_hevc_epel_h4_8_msa;
+ c->put_hevc_epel[2][0][1] = ff_hevc_put_hevc_epel_h6_8_msa;
+ c->put_hevc_epel[3][0][1] = ff_hevc_put_hevc_epel_h8_8_msa;
+ c->put_hevc_epel[4][0][1] = ff_hevc_put_hevc_epel_h12_8_msa;
+ c->put_hevc_epel[5][0][1] = ff_hevc_put_hevc_epel_h16_8_msa;
+ c->put_hevc_epel[6][0][1] = ff_hevc_put_hevc_epel_h24_8_msa;
+ c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_msa;
+
+ c->put_hevc_epel[1][1][0] = ff_hevc_put_hevc_epel_v4_8_msa;
+ c->put_hevc_epel[2][1][0] = ff_hevc_put_hevc_epel_v6_8_msa;
+ c->put_hevc_epel[3][1][0] = ff_hevc_put_hevc_epel_v8_8_msa;
+ c->put_hevc_epel[4][1][0] = ff_hevc_put_hevc_epel_v12_8_msa;
+ c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_8_msa;
+ c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_8_msa;
+ c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_msa;
+
+ c->put_hevc_epel[1][1][1] = ff_hevc_put_hevc_epel_hv4_8_msa;
+ c->put_hevc_epel[2][1][1] = ff_hevc_put_hevc_epel_hv6_8_msa;
+ c->put_hevc_epel[3][1][1] = ff_hevc_put_hevc_epel_hv8_8_msa;
+ c->put_hevc_epel[4][1][1] = ff_hevc_put_hevc_epel_hv12_8_msa;
+ c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_8_msa;
+ c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_8_msa;
+ c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_msa;
+
+ c->put_hevc_qpel_uni[3][0][0] = ff_hevc_put_hevc_uni_pel_pixels8_8_msa;
+ c->put_hevc_qpel_uni[4][0][0] = ff_hevc_put_hevc_uni_pel_pixels12_8_msa;
+ c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels16_8_msa;
+ c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels24_8_msa;
+ c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_msa;
+ c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_msa;
+ c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_msa;
+
+ c->put_hevc_qpel_uni[1][0][1] = ff_hevc_put_hevc_uni_qpel_h4_8_msa;
+ c->put_hevc_qpel_uni[3][0][1] = ff_hevc_put_hevc_uni_qpel_h8_8_msa;
+ c->put_hevc_qpel_uni[4][0][1] = ff_hevc_put_hevc_uni_qpel_h12_8_msa;
+ c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_8_msa;
+ c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_8_msa;
+ c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_msa;
+ c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_msa;
+ c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_msa;
+
+ c->put_hevc_qpel_uni[1][1][0] = ff_hevc_put_hevc_uni_qpel_v4_8_msa;
+ c->put_hevc_qpel_uni[3][1][0] = ff_hevc_put_hevc_uni_qpel_v8_8_msa;
+ c->put_hevc_qpel_uni[4][1][0] = ff_hevc_put_hevc_uni_qpel_v12_8_msa;
+ c->put_hevc_qpel_uni[5][1][0] = ff_hevc_put_hevc_uni_qpel_v16_8_msa;
+ c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_8_msa;
+ c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_msa;
+ c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_msa;
+ c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_msa;
+
+ c->put_hevc_qpel_uni[1][1][1] = ff_hevc_put_hevc_uni_qpel_hv4_8_msa;
+ c->put_hevc_qpel_uni[3][1][1] = ff_hevc_put_hevc_uni_qpel_hv8_8_msa;
+ c->put_hevc_qpel_uni[4][1][1] = ff_hevc_put_hevc_uni_qpel_hv12_8_msa;
+ c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_8_msa;
+ c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_8_msa;
+ c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_8_msa;
+ c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_8_msa;
+ c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_8_msa;
+
+ c->put_hevc_epel_uni[3][0][0] = ff_hevc_put_hevc_uni_pel_pixels8_8_msa;
+ c->put_hevc_epel_uni[4][0][0] = ff_hevc_put_hevc_uni_pel_pixels12_8_msa;
+ c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels16_8_msa;
+ c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels24_8_msa;
+ c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_msa;
+
+ c->put_hevc_epel_uni[1][0][1] = ff_hevc_put_hevc_uni_epel_h4_8_msa;
+ c->put_hevc_epel_uni[2][0][1] = ff_hevc_put_hevc_uni_epel_h6_8_msa;
+ c->put_hevc_epel_uni[3][0][1] = ff_hevc_put_hevc_uni_epel_h8_8_msa;
+ c->put_hevc_epel_uni[4][0][1] = ff_hevc_put_hevc_uni_epel_h12_8_msa;
+ c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_8_msa;
+ c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_8_msa;
+ c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_msa;
+
+ c->put_hevc_epel_uni[1][1][0] = ff_hevc_put_hevc_uni_epel_v4_8_msa;
+ c->put_hevc_epel_uni[2][1][0] = ff_hevc_put_hevc_uni_epel_v6_8_msa;
+ c->put_hevc_epel_uni[3][1][0] = ff_hevc_put_hevc_uni_epel_v8_8_msa;
+ c->put_hevc_epel_uni[4][1][0] = ff_hevc_put_hevc_uni_epel_v12_8_msa;
+ c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_8_msa;
+ c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_8_msa;
+ c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_msa;
+
+ c->put_hevc_epel_uni[1][1][1] = ff_hevc_put_hevc_uni_epel_hv4_8_msa;
+ c->put_hevc_epel_uni[2][1][1] = ff_hevc_put_hevc_uni_epel_hv6_8_msa;
+ c->put_hevc_epel_uni[3][1][1] = ff_hevc_put_hevc_uni_epel_hv8_8_msa;
+ c->put_hevc_epel_uni[4][1][1] = ff_hevc_put_hevc_uni_epel_hv12_8_msa;
+ c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_8_msa;
+ c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_8_msa;
+ c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_msa;
+
+ c->put_hevc_qpel_uni_w[1][0][0] =
+ ff_hevc_put_hevc_uni_w_pel_pixels4_8_msa;
+ c->put_hevc_qpel_uni_w[3][0][0] =
+ ff_hevc_put_hevc_uni_w_pel_pixels8_8_msa;
+ c->put_hevc_qpel_uni_w[4][0][0] =
+ ff_hevc_put_hevc_uni_w_pel_pixels12_8_msa;
+ c->put_hevc_qpel_uni_w[5][0][0] =
+ ff_hevc_put_hevc_uni_w_pel_pixels16_8_msa;
+ c->put_hevc_qpel_uni_w[6][0][0] =
+ ff_hevc_put_hevc_uni_w_pel_pixels24_8_msa;
+ c->put_hevc_qpel_uni_w[7][0][0] =
+ ff_hevc_put_hevc_uni_w_pel_pixels32_8_msa;
+ c->put_hevc_qpel_uni_w[8][0][0] =
+ ff_hevc_put_hevc_uni_w_pel_pixels48_8_msa;
+ c->put_hevc_qpel_uni_w[9][0][0] =
+ ff_hevc_put_hevc_uni_w_pel_pixels64_8_msa;
+
+ c->put_hevc_qpel_uni_w[1][0][1] = ff_hevc_put_hevc_uni_w_qpel_h4_8_msa;
+ c->put_hevc_qpel_uni_w[3][0][1] = ff_hevc_put_hevc_uni_w_qpel_h8_8_msa;
+ c->put_hevc_qpel_uni_w[4][0][1] = ff_hevc_put_hevc_uni_w_qpel_h12_8_msa;
+ c->put_hevc_qpel_uni_w[5][0][1] = ff_hevc_put_hevc_uni_w_qpel_h16_8_msa;
+ c->put_hevc_qpel_uni_w[6][0][1] = ff_hevc_put_hevc_uni_w_qpel_h24_8_msa;
+ c->put_hevc_qpel_uni_w[7][0][1] = ff_hevc_put_hevc_uni_w_qpel_h32_8_msa;
+ c->put_hevc_qpel_uni_w[8][0][1] = ff_hevc_put_hevc_uni_w_qpel_h48_8_msa;
+ c->put_hevc_qpel_uni_w[9][0][1] = ff_hevc_put_hevc_uni_w_qpel_h64_8_msa;
+
+ c->put_hevc_qpel_uni_w[1][1][0] = ff_hevc_put_hevc_uni_w_qpel_v4_8_msa;
+ c->put_hevc_qpel_uni_w[3][1][0] = ff_hevc_put_hevc_uni_w_qpel_v8_8_msa;
+ c->put_hevc_qpel_uni_w[4][1][0] = ff_hevc_put_hevc_uni_w_qpel_v12_8_msa;
+ c->put_hevc_qpel_uni_w[5][1][0] = ff_hevc_put_hevc_uni_w_qpel_v16_8_msa;
+ c->put_hevc_qpel_uni_w[6][1][0] = ff_hevc_put_hevc_uni_w_qpel_v24_8_msa;
+ c->put_hevc_qpel_uni_w[7][1][0] = ff_hevc_put_hevc_uni_w_qpel_v32_8_msa;
+ c->put_hevc_qpel_uni_w[8][1][0] = ff_hevc_put_hevc_uni_w_qpel_v48_8_msa;
+ c->put_hevc_qpel_uni_w[9][1][0] = ff_hevc_put_hevc_uni_w_qpel_v64_8_msa;
+
+ c->put_hevc_qpel_uni_w[1][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv4_8_msa;
+ c->put_hevc_qpel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv8_8_msa;
+ c->put_hevc_qpel_uni_w[4][1][1] =
+ ff_hevc_put_hevc_uni_w_qpel_hv12_8_msa;
+ c->put_hevc_qpel_uni_w[5][1][1] =
+ ff_hevc_put_hevc_uni_w_qpel_hv16_8_msa;
+ c->put_hevc_qpel_uni_w[6][1][1] =
+ ff_hevc_put_hevc_uni_w_qpel_hv24_8_msa;
+ c->put_hevc_qpel_uni_w[7][1][1] =
+ ff_hevc_put_hevc_uni_w_qpel_hv32_8_msa;
+ c->put_hevc_qpel_uni_w[8][1][1] =
+ ff_hevc_put_hevc_uni_w_qpel_hv48_8_msa;
+ c->put_hevc_qpel_uni_w[9][1][1] =
+ ff_hevc_put_hevc_uni_w_qpel_hv64_8_msa;
+
+ c->put_hevc_epel_uni_w[1][0][0] =
+ ff_hevc_put_hevc_uni_w_pel_pixels4_8_msa;
+ c->put_hevc_epel_uni_w[2][0][0] =
+ ff_hevc_put_hevc_uni_w_pel_pixels6_8_msa;
+ c->put_hevc_epel_uni_w[3][0][0] =
+ ff_hevc_put_hevc_uni_w_pel_pixels8_8_msa;
+ c->put_hevc_epel_uni_w[4][0][0] =
+ ff_hevc_put_hevc_uni_w_pel_pixels12_8_msa;
+ c->put_hevc_epel_uni_w[5][0][0] =
+ ff_hevc_put_hevc_uni_w_pel_pixels16_8_msa;
+ c->put_hevc_epel_uni_w[6][0][0] =
+ ff_hevc_put_hevc_uni_w_pel_pixels24_8_msa;
+ c->put_hevc_epel_uni_w[7][0][0] =
+ ff_hevc_put_hevc_uni_w_pel_pixels32_8_msa;
+
+ c->put_hevc_epel_uni_w[1][0][1] = ff_hevc_put_hevc_uni_w_epel_h4_8_msa;
+ c->put_hevc_epel_uni_w[2][0][1] = ff_hevc_put_hevc_uni_w_epel_h6_8_msa;
+ c->put_hevc_epel_uni_w[3][0][1] = ff_hevc_put_hevc_uni_w_epel_h8_8_msa;
+ c->put_hevc_epel_uni_w[4][0][1] = ff_hevc_put_hevc_uni_w_epel_h12_8_msa;
+ c->put_hevc_epel_uni_w[5][0][1] = ff_hevc_put_hevc_uni_w_epel_h16_8_msa;
+ c->put_hevc_epel_uni_w[6][0][1] = ff_hevc_put_hevc_uni_w_epel_h24_8_msa;
+ c->put_hevc_epel_uni_w[7][0][1] = ff_hevc_put_hevc_uni_w_epel_h32_8_msa;
+
+ c->put_hevc_epel_uni_w[1][1][0] = ff_hevc_put_hevc_uni_w_epel_v4_8_msa;
+ c->put_hevc_epel_uni_w[2][1][0] = ff_hevc_put_hevc_uni_w_epel_v6_8_msa;
+ c->put_hevc_epel_uni_w[3][1][0] = ff_hevc_put_hevc_uni_w_epel_v8_8_msa;
+ c->put_hevc_epel_uni_w[4][1][0] = ff_hevc_put_hevc_uni_w_epel_v12_8_msa;
+ c->put_hevc_epel_uni_w[5][1][0] = ff_hevc_put_hevc_uni_w_epel_v16_8_msa;
+ c->put_hevc_epel_uni_w[6][1][0] = ff_hevc_put_hevc_uni_w_epel_v24_8_msa;
+ c->put_hevc_epel_uni_w[7][1][0] = ff_hevc_put_hevc_uni_w_epel_v32_8_msa;
+
+ c->put_hevc_epel_uni_w[1][1][1] = ff_hevc_put_hevc_uni_w_epel_hv4_8_msa;
+ c->put_hevc_epel_uni_w[2][1][1] = ff_hevc_put_hevc_uni_w_epel_hv6_8_msa;
+ c->put_hevc_epel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_epel_hv8_8_msa;
+ c->put_hevc_epel_uni_w[4][1][1] =
+ ff_hevc_put_hevc_uni_w_epel_hv12_8_msa;
+ c->put_hevc_epel_uni_w[5][1][1] =
+ ff_hevc_put_hevc_uni_w_epel_hv16_8_msa;
+ c->put_hevc_epel_uni_w[6][1][1] =
+ ff_hevc_put_hevc_uni_w_epel_hv24_8_msa;
+ c->put_hevc_epel_uni_w[7][1][1] =
+ ff_hevc_put_hevc_uni_w_epel_hv32_8_msa;
+
+ c->put_hevc_qpel_bi[1][0][0] = ff_hevc_put_hevc_bi_pel_pixels4_8_msa;
+ c->put_hevc_qpel_bi[3][0][0] = ff_hevc_put_hevc_bi_pel_pixels8_8_msa;
+ c->put_hevc_qpel_bi[4][0][0] = ff_hevc_put_hevc_bi_pel_pixels12_8_msa;
+ c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_8_msa;
+ c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_8_msa;
+ c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_msa;
+ c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_msa;
+ c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_msa;
+
+ c->put_hevc_qpel_bi[1][0][1] = ff_hevc_put_hevc_bi_qpel_h4_8_msa;
+ c->put_hevc_qpel_bi[3][0][1] = ff_hevc_put_hevc_bi_qpel_h8_8_msa;
+ c->put_hevc_qpel_bi[4][0][1] = ff_hevc_put_hevc_bi_qpel_h12_8_msa;
+ c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_8_msa;
+ c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_8_msa;
+ c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_msa;
+ c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_msa;
+ c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_msa;
+
+ c->put_hevc_qpel_bi[1][1][0] = ff_hevc_put_hevc_bi_qpel_v4_8_msa;
+ c->put_hevc_qpel_bi[3][1][0] = ff_hevc_put_hevc_bi_qpel_v8_8_msa;
+ c->put_hevc_qpel_bi[4][1][0] = ff_hevc_put_hevc_bi_qpel_v12_8_msa;
+ c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_8_msa;
+ c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_8_msa;
+ c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_msa;
+ c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_msa;
+ c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_msa;
+
+ c->put_hevc_qpel_bi[1][1][1] = ff_hevc_put_hevc_bi_qpel_hv4_8_msa;
+ c->put_hevc_qpel_bi[3][1][1] = ff_hevc_put_hevc_bi_qpel_hv8_8_msa;
+ c->put_hevc_qpel_bi[4][1][1] = ff_hevc_put_hevc_bi_qpel_hv12_8_msa;
+ c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_8_msa;
+ c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_8_msa;
+ c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_8_msa;
+ c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_8_msa;
+ c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_8_msa;
+
+ c->put_hevc_epel_bi[1][0][0] = ff_hevc_put_hevc_bi_pel_pixels4_8_msa;
+ c->put_hevc_epel_bi[2][0][0] = ff_hevc_put_hevc_bi_pel_pixels6_8_msa;
+ c->put_hevc_epel_bi[3][0][0] = ff_hevc_put_hevc_bi_pel_pixels8_8_msa;
+ c->put_hevc_epel_bi[4][0][0] = ff_hevc_put_hevc_bi_pel_pixels12_8_msa;
+ c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_8_msa;
+ c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_8_msa;
+ c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_msa;
+
+ c->put_hevc_epel_bi[1][0][1] = ff_hevc_put_hevc_bi_epel_h4_8_msa;
+ c->put_hevc_epel_bi[2][0][1] = ff_hevc_put_hevc_bi_epel_h6_8_msa;
+ c->put_hevc_epel_bi[3][0][1] = ff_hevc_put_hevc_bi_epel_h8_8_msa;
+ c->put_hevc_epel_bi[4][0][1] = ff_hevc_put_hevc_bi_epel_h12_8_msa;
+ c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_8_msa;
+ c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_8_msa;
+ c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_msa;
+
+ c->put_hevc_epel_bi[1][1][0] = ff_hevc_put_hevc_bi_epel_v4_8_msa;
+ c->put_hevc_epel_bi[2][1][0] = ff_hevc_put_hevc_bi_epel_v6_8_msa;
+ c->put_hevc_epel_bi[3][1][0] = ff_hevc_put_hevc_bi_epel_v8_8_msa;
+ c->put_hevc_epel_bi[4][1][0] = ff_hevc_put_hevc_bi_epel_v12_8_msa;
+ c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_8_msa;
+ c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_8_msa;
+ c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_msa;
+
+ c->put_hevc_epel_bi[1][1][1] = ff_hevc_put_hevc_bi_epel_hv4_8_msa;
+ c->put_hevc_epel_bi[2][1][1] = ff_hevc_put_hevc_bi_epel_hv6_8_msa;
+ c->put_hevc_epel_bi[3][1][1] = ff_hevc_put_hevc_bi_epel_hv8_8_msa;
+ c->put_hevc_epel_bi[4][1][1] = ff_hevc_put_hevc_bi_epel_hv12_8_msa;
+ c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_8_msa;
+ c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_8_msa;
+ c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_msa;
+
+ c->put_hevc_qpel_bi_w[1][0][0] =
+ ff_hevc_put_hevc_bi_w_pel_pixels4_8_msa;
+ c->put_hevc_qpel_bi_w[3][0][0] =
+ ff_hevc_put_hevc_bi_w_pel_pixels8_8_msa;
+ c->put_hevc_qpel_bi_w[4][0][0] =
+ ff_hevc_put_hevc_bi_w_pel_pixels12_8_msa;
+ c->put_hevc_qpel_bi_w[5][0][0] =
+ ff_hevc_put_hevc_bi_w_pel_pixels16_8_msa;
+ c->put_hevc_qpel_bi_w[6][0][0] =
+ ff_hevc_put_hevc_bi_w_pel_pixels24_8_msa;
+ c->put_hevc_qpel_bi_w[7][0][0] =
+ ff_hevc_put_hevc_bi_w_pel_pixels32_8_msa;
+ c->put_hevc_qpel_bi_w[8][0][0] =
+ ff_hevc_put_hevc_bi_w_pel_pixels48_8_msa;
+ c->put_hevc_qpel_bi_w[9][0][0] =
+ ff_hevc_put_hevc_bi_w_pel_pixels64_8_msa;
+
+ c->put_hevc_qpel_bi_w[1][0][1] = ff_hevc_put_hevc_bi_w_qpel_h4_8_msa;
+ c->put_hevc_qpel_bi_w[3][0][1] = ff_hevc_put_hevc_bi_w_qpel_h8_8_msa;
+ c->put_hevc_qpel_bi_w[4][0][1] = ff_hevc_put_hevc_bi_w_qpel_h12_8_msa;
+ c->put_hevc_qpel_bi_w[5][0][1] = ff_hevc_put_hevc_bi_w_qpel_h16_8_msa;
+ c->put_hevc_qpel_bi_w[6][0][1] = ff_hevc_put_hevc_bi_w_qpel_h24_8_msa;
+ c->put_hevc_qpel_bi_w[7][0][1] = ff_hevc_put_hevc_bi_w_qpel_h32_8_msa;
+ c->put_hevc_qpel_bi_w[8][0][1] = ff_hevc_put_hevc_bi_w_qpel_h48_8_msa;
+ c->put_hevc_qpel_bi_w[9][0][1] = ff_hevc_put_hevc_bi_w_qpel_h64_8_msa;
+
+ c->put_hevc_qpel_bi_w[1][1][0] = ff_hevc_put_hevc_bi_w_qpel_v4_8_msa;
+ c->put_hevc_qpel_bi_w[3][1][0] = ff_hevc_put_hevc_bi_w_qpel_v8_8_msa;
+ c->put_hevc_qpel_bi_w[4][1][0] = ff_hevc_put_hevc_bi_w_qpel_v12_8_msa;
+ c->put_hevc_qpel_bi_w[5][1][0] = ff_hevc_put_hevc_bi_w_qpel_v16_8_msa;
+ c->put_hevc_qpel_bi_w[6][1][0] = ff_hevc_put_hevc_bi_w_qpel_v24_8_msa;
+ c->put_hevc_qpel_bi_w[7][1][0] = ff_hevc_put_hevc_bi_w_qpel_v32_8_msa;
+ c->put_hevc_qpel_bi_w[8][1][0] = ff_hevc_put_hevc_bi_w_qpel_v48_8_msa;
+ c->put_hevc_qpel_bi_w[9][1][0] = ff_hevc_put_hevc_bi_w_qpel_v64_8_msa;
+
+ c->put_hevc_qpel_bi_w[1][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv4_8_msa;
+ c->put_hevc_qpel_bi_w[3][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv8_8_msa;
+ c->put_hevc_qpel_bi_w[4][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv12_8_msa;
+ c->put_hevc_qpel_bi_w[5][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv16_8_msa;
+ c->put_hevc_qpel_bi_w[6][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv24_8_msa;
+ c->put_hevc_qpel_bi_w[7][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv32_8_msa;
+ c->put_hevc_qpel_bi_w[8][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv48_8_msa;
+ c->put_hevc_qpel_bi_w[9][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv64_8_msa;
+
+ c->put_hevc_epel_bi_w[1][0][0] =
+ ff_hevc_put_hevc_bi_w_pel_pixels4_8_msa;
+ c->put_hevc_epel_bi_w[2][0][0] =
+ ff_hevc_put_hevc_bi_w_pel_pixels6_8_msa;
+ c->put_hevc_epel_bi_w[3][0][0] =
+ ff_hevc_put_hevc_bi_w_pel_pixels8_8_msa;
+ c->put_hevc_epel_bi_w[4][0][0] =
+ ff_hevc_put_hevc_bi_w_pel_pixels12_8_msa;
+ c->put_hevc_epel_bi_w[5][0][0] =
+ ff_hevc_put_hevc_bi_w_pel_pixels16_8_msa;
+ c->put_hevc_epel_bi_w[6][0][0] =
+ ff_hevc_put_hevc_bi_w_pel_pixels24_8_msa;
+ c->put_hevc_epel_bi_w[7][0][0] =
+ ff_hevc_put_hevc_bi_w_pel_pixels32_8_msa;
+
+ c->put_hevc_epel_bi_w[1][0][1] = ff_hevc_put_hevc_bi_w_epel_h4_8_msa;
+ c->put_hevc_epel_bi_w[2][0][1] = ff_hevc_put_hevc_bi_w_epel_h6_8_msa;
+ c->put_hevc_epel_bi_w[3][0][1] = ff_hevc_put_hevc_bi_w_epel_h8_8_msa;
+ c->put_hevc_epel_bi_w[4][0][1] = ff_hevc_put_hevc_bi_w_epel_h12_8_msa;
+ c->put_hevc_epel_bi_w[5][0][1] = ff_hevc_put_hevc_bi_w_epel_h16_8_msa;
+ c->put_hevc_epel_bi_w[6][0][1] = ff_hevc_put_hevc_bi_w_epel_h24_8_msa;
+ c->put_hevc_epel_bi_w[7][0][1] = ff_hevc_put_hevc_bi_w_epel_h32_8_msa;
+
+ c->put_hevc_epel_bi_w[1][1][0] = ff_hevc_put_hevc_bi_w_epel_v4_8_msa;
+ c->put_hevc_epel_bi_w[2][1][0] = ff_hevc_put_hevc_bi_w_epel_v6_8_msa;
+ c->put_hevc_epel_bi_w[3][1][0] = ff_hevc_put_hevc_bi_w_epel_v8_8_msa;
+ c->put_hevc_epel_bi_w[4][1][0] = ff_hevc_put_hevc_bi_w_epel_v12_8_msa;
+ c->put_hevc_epel_bi_w[5][1][0] = ff_hevc_put_hevc_bi_w_epel_v16_8_msa;
+ c->put_hevc_epel_bi_w[6][1][0] = ff_hevc_put_hevc_bi_w_epel_v24_8_msa;
+ c->put_hevc_epel_bi_w[7][1][0] = ff_hevc_put_hevc_bi_w_epel_v32_8_msa;
+
+ c->put_hevc_epel_bi_w[1][1][1] = ff_hevc_put_hevc_bi_w_epel_hv4_8_msa;
+ c->put_hevc_epel_bi_w[2][1][1] = ff_hevc_put_hevc_bi_w_epel_hv6_8_msa;
+ c->put_hevc_epel_bi_w[3][1][1] = ff_hevc_put_hevc_bi_w_epel_hv8_8_msa;
+ c->put_hevc_epel_bi_w[4][1][1] = ff_hevc_put_hevc_bi_w_epel_hv12_8_msa;
+ c->put_hevc_epel_bi_w[5][1][1] = ff_hevc_put_hevc_bi_w_epel_hv16_8_msa;
+ c->put_hevc_epel_bi_w[6][1][1] = ff_hevc_put_hevc_bi_w_epel_hv24_8_msa;
+ c->put_hevc_epel_bi_w[7][1][1] = ff_hevc_put_hevc_bi_w_epel_hv32_8_msa;
+
+ c->sao_band_filter[0] =
+ c->sao_band_filter[1] =
+ c->sao_band_filter[2] =
+ c->sao_band_filter[3] =
+ c->sao_band_filter[4] = ff_hevc_sao_band_filter_0_8_msa;
+
+ c->sao_edge_filter[0] =
+ c->sao_edge_filter[1] =
+ c->sao_edge_filter[2] =
+ c->sao_edge_filter[3] =
+ c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_8_msa;
+
+ c->hevc_h_loop_filter_luma = ff_hevc_loop_filter_luma_h_8_msa;
+ c->hevc_v_loop_filter_luma = ff_hevc_loop_filter_luma_v_8_msa;
+
+ c->hevc_h_loop_filter_chroma = ff_hevc_loop_filter_chroma_h_8_msa;
+ c->hevc_v_loop_filter_chroma = ff_hevc_loop_filter_chroma_v_8_msa;
+
+ c->hevc_h_loop_filter_luma_c = ff_hevc_loop_filter_luma_h_8_msa;
+ c->hevc_v_loop_filter_luma_c = ff_hevc_loop_filter_luma_v_8_msa;
+
+ c->hevc_h_loop_filter_chroma_c =
+ ff_hevc_loop_filter_chroma_h_8_msa;
+ c->hevc_v_loop_filter_chroma_c =
+ ff_hevc_loop_filter_chroma_v_8_msa;
+
+ c->idct[0] = ff_hevc_idct_4x4_msa;
+ c->idct[1] = ff_hevc_idct_8x8_msa;
+ c->idct[2] = ff_hevc_idct_16x16_msa;
+ c->idct[3] = ff_hevc_idct_32x32_msa;
+ c->idct_dc[0] = ff_hevc_idct_dc_4x4_msa;
+ c->idct_dc[1] = ff_hevc_idct_dc_8x8_msa;
+ c->idct_dc[2] = ff_hevc_idct_dc_16x16_msa;
+ c->idct_dc[3] = ff_hevc_idct_dc_32x32_msa;
- c->transform_add[0] = ff_hevc_addblk_4x4_msa;
- c->transform_add[1] = ff_hevc_addblk_8x8_msa;
- c->transform_add[2] = ff_hevc_addblk_16x16_msa;
- c->transform_add[3] = ff_hevc_addblk_32x32_msa;
- c->idct_4x4_luma = ff_hevc_idct_luma_4x4_msa;
++ c->add_residual[0] = ff_hevc_addblk_4x4_msa;
++ c->add_residual[1] = ff_hevc_addblk_8x8_msa;
++ c->add_residual[2] = ff_hevc_addblk_16x16_msa;
++ c->add_residual[3] = ff_hevc_addblk_32x32_msa;
++ c->transform_4x4_luma = ff_hevc_idct_luma_4x4_msa;
+ }
+}
+#endif // #if HAVE_MSA
+
+void ff_hevc_dsp_init_mips(HEVCDSPContext *c, const int bit_depth)
+{
+#if HAVE_MSA
+ hevc_dsp_init_msa(c, bit_depth);
+#endif // #if HAVE_MSA
+}
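
The three-dimensional tables filled above follow the put_hevc_* indexing used
elsewhere in the decoder: width bucket first, then whether vertical and
horizontal filtering are needed. A dispatch sketch (the !!my/!!mx convention
is assumed from the generic call sites, and the helper name is hypothetical):

    static void mc_luma(HEVCDSPContext *c, int idx, int16_t *dst,
                        uint8_t *src, ptrdiff_t srcstride,
                        int height, intptr_t mx, intptr_t my, int width)
    {
        /* [size bucket][vertical taps needed][horizontal taps needed] */
        c->put_hevc_qpel[idx][!!my][!!mx](dst, src, srcstride,
                                          height, mx, my, width);
    }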
diff --cc libavcodec/x86/hevc_res_add.asm
index dc3e88a,0000000..869288f
mode 100644,000000..100644
--- a/libavcodec/x86/hevc_res_add.asm
+++ b/libavcodec/x86/hevc_res_add.asm
@@@ -1,388 -1,0 +1,388 @@@
+; /*
- ; * Provide SIMD optimizations for transform_add functions for HEVC decoding
++; * Provide SIMD optimizations for add_residual functions for HEVC decoding
+; * Copyright (c) 2014 Pierre-Edouard LEPERE
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+cextern pw_1023
+%define max_pixels_10 pw_1023
+
+
+; The tr_add macros and functions below were largely inspired by the x264 project's code in its h264_idct.asm file.
+%macro TR_ADD_MMX_4_8 0
+ mova m2, [r1]
+ mova m4, [r1+8]
+ pxor m3, m3
+ psubw m3, m2
+ packuswb m2, m2
+ packuswb m3, m3
+ pxor m5, m5
+ psubw m5, m4
+ packuswb m4, m4
+ packuswb m5, m5
+
+ movh m0, [r0 ]
+ movh m1, [r0+r2 ]
+ paddusb m0, m2
+ paddusb m1, m4
+ psubusb m0, m3
+ psubusb m1, m5
+ movh [r0 ], m0
+ movh [r0+r2 ], m1
+%endmacro
+
+
+INIT_MMX mmxext
+; void ff_hevc_add_residual4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
- cglobal hevc_transform_add4_8, 3, 4, 6
++cglobal hevc_add_residual4_8, 3, 4, 6
+ TR_ADD_MMX_4_8
+ add r1, 16
+ lea r0, [r0+r2*2]
+ TR_ADD_MMX_4_8
+ RET
+
+%macro TR_ADD_SSE_8_8 0
+ pxor m3, m3
+ mova m4, [r1]
+ mova m6, [r1+16]
+ mova m0, [r1+32]
+ mova m2, [r1+48]
+ psubw m5, m3, m4
+ psubw m7, m3, m6
+ psubw m1, m3, m0
+ packuswb m4, m0
+ packuswb m5, m1
+ psubw m3, m2
+ packuswb m6, m2
+ packuswb m7, m3
+
+ movq m0, [r0 ]
+ movq m1, [r0+r2 ]
+ movhps m0, [r0+r2*2]
+ movhps m1, [r0+r3 ]
+ paddusb m0, m4
+ paddusb m1, m6
+ psubusb m0, m5
+ psubusb m1, m7
+ movq [r0 ], m0
+ movq [r0+r2 ], m1
+ movhps [r0+2*r2], m0
+ movhps [r0+r3 ], m1
+%endmacro
+
+%macro TR_ADD_SSE_16_32_8 3
+ mova xm2, [r1+%1 ]
+ mova xm6, [r1+%1+16]
+%if cpuflag(avx2)
+ vinserti128 m2, m2, [r1+%1+32], 1
+ vinserti128 m6, m6, [r1+%1+48], 1
+%endif
+%if cpuflag(avx)
+ psubw m1, m0, m2
+ psubw m5, m0, m6
+%else
+ mova m1, m0
+ mova m5, m0
+ psubw m1, m2
+ psubw m5, m6
+%endif
+ packuswb m2, m6
+ packuswb m1, m5
+
+ mova xm4, [r1+%1+mmsize*2 ]
+ mova xm6, [r1+%1+mmsize*2+16]
+%if cpuflag(avx2)
+ vinserti128 m4, m4, [r1+%1+96 ], 1
+ vinserti128 m6, m6, [r1+%1+112], 1
+%endif
+%if cpuflag(avx)
+ psubw m3, m0, m4
+ psubw m5, m0, m6
+%else
+ mova m3, m0
+ mova m5, m0
+ psubw m3, m4
+ psubw m5, m6
+%endif
+ packuswb m4, m6
+ packuswb m3, m5
+
+ paddusb m2, [%2]
+ paddusb m4, [%3]
+ psubusb m2, m1
+ psubusb m4, m3
+ mova [%2], m2
+ mova [%3], m4
+%endmacro
+
+
+%macro TRANSFORM_ADD_8 0
- ; void ff_hevc_transform_add8_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
- cglobal hevc_transform_add8_8, 3, 4, 8
++; void ff_hevc_add_residual8_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
++cglobal hevc_add_residual8_8, 3, 4, 8
+ lea r3, [r2*3]
+ TR_ADD_SSE_8_8
+ add r1, 64
+ lea r0, [r0+r2*4]
+ TR_ADD_SSE_8_8
+ RET
+
- ; void ff_hevc_transform_add16_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
- cglobal hevc_transform_add16_8, 3, 4, 7
++; void ff_hevc_add_residual16_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
++cglobal hevc_add_residual16_8, 3, 4, 7
+ pxor m0, m0
+ lea r3, [r2*3]
+ TR_ADD_SSE_16_32_8 0, r0, r0+r2
+ TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
+%rep 3
+ add r1, 128
+ lea r0, [r0+r2*4]
+ TR_ADD_SSE_16_32_8 0, r0, r0+r2
+ TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
+%endrep
+ RET
+
- ; void ff_hevc_transform_add32_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
- cglobal hevc_transform_add32_8, 3, 4, 7
++; void ff_hevc_add_residual32_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
++cglobal hevc_add_residual32_8, 3, 4, 7
+ pxor m0, m0
+ TR_ADD_SSE_16_32_8 0, r0, r0+16
+ TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
+%rep 15
+ add r1, 128
+ lea r0, [r0+r2*2]
+ TR_ADD_SSE_16_32_8 0, r0, r0+16
+ TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
+%endrep
+ RET
+%endmacro
+
+INIT_XMM sse2
+TRANSFORM_ADD_8
+INIT_XMM avx
+TRANSFORM_ADD_8
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
- ; void ff_hevc_transform_add32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
- cglobal hevc_transform_add32_8, 3, 4, 7
++; void ff_hevc_add_residual32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
++cglobal hevc_add_residual32_8, 3, 4, 7
+ pxor m0, m0
+ lea r3, [r2*3]
+ TR_ADD_SSE_16_32_8 0, r0, r0+r2
+ TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
+%rep 7
+ add r1, 256
+ lea r0, [r0+r2*4]
+ TR_ADD_SSE_16_32_8 0, r0, r0+r2
+ TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
+%endrep
+ RET
+%endif
+
+;-----------------------------------------------------------------------------
- ; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride)
++; void ff_hevc_add_residual_10(pixel *dst, int16_t *block, int stride)
+;-----------------------------------------------------------------------------
+%macro TR_ADD_SSE_8_10 4
+ mova m0, [%4]
+ mova m1, [%4+16]
+ mova m2, [%4+32]
+ mova m3, [%4+48]
+ paddw m0, [%1+0 ]
+ paddw m1, [%1+%2 ]
+ paddw m2, [%1+%2*2]
+ paddw m3, [%1+%3 ]
+ CLIPW m0, m4, m5
+ CLIPW m1, m4, m5
+ CLIPW m2, m4, m5
+ CLIPW m3, m4, m5
+ mova [%1+0 ], m0
+ mova [%1+%2 ], m1
+ mova [%1+%2*2], m2
+ mova [%1+%3 ], m3
+%endmacro
+
+%macro TR_ADD_MMX4_10 3
+ mova m0, [%1+0 ]
+ mova m1, [%1+%2 ]
+ paddw m0, [%3]
+ paddw m1, [%3+8]
+ CLIPW m0, m2, m3
+ CLIPW m1, m2, m3
+ mova [%1+0 ], m0
+ mova [%1+%2 ], m1
+%endmacro
+
+%macro TRANS_ADD_SSE_16_10 3
+ mova m0, [%3]
+ mova m1, [%3+16]
+ mova m2, [%3+32]
+ mova m3, [%3+48]
+ paddw m0, [%1 ]
+ paddw m1, [%1+16 ]
+ paddw m2, [%1+%2 ]
+ paddw m3, [%1+%2+16]
+ CLIPW m0, m4, m5
+ CLIPW m1, m4, m5
+ CLIPW m2, m4, m5
+ CLIPW m3, m4, m5
+ mova [%1 ], m0
+ mova [%1+16 ], m1
+ mova [%1+%2 ], m2
+ mova [%1+%2+16], m3
+%endmacro
+
+%macro TRANS_ADD_SSE_32_10 2
+ mova m0, [%2]
+ mova m1, [%2+16]
+ mova m2, [%2+32]
+ mova m3, [%2+48]
+
+ paddw m0, [%1 ]
+ paddw m1, [%1+16]
+ paddw m2, [%1+32]
+ paddw m3, [%1+48]
+ CLIPW m0, m4, m5
+ CLIPW m1, m4, m5
+ CLIPW m2, m4, m5
+ CLIPW m3, m4, m5
+ mova [%1 ], m0
+ mova [%1+16], m1
+ mova [%1+32], m2
+ mova [%1+48], m3
+%endmacro
+
+%macro TRANS_ADD16_AVX2 4
+ mova m0, [%4]
+ mova m1, [%4+32]
+ mova m2, [%4+64]
+ mova m3, [%4+96]
+
+ paddw m0, [%1+0 ]
+ paddw m1, [%1+%2 ]
+ paddw m2, [%1+%2*2]
+ paddw m3, [%1+%3 ]
+
+ CLIPW m0, m4, m5
+ CLIPW m1, m4, m5
+ CLIPW m2, m4, m5
+ CLIPW m3, m4, m5
+ mova [%1+0 ], m0
+ mova [%1+%2 ], m1
+ mova [%1+%2*2], m2
+ mova [%1+%3 ], m3
+%endmacro
+
+%macro TRANS_ADD32_AVX2 3
+ mova m0, [%3]
+ mova m1, [%3+32]
+ mova m2, [%3+64]
+ mova m3, [%3+96]
+
+ paddw m0, [%1 ]
+ paddw m1, [%1+32 ]
+ paddw m2, [%1+%2 ]
+ paddw m3, [%1+%2+32]
+
+ CLIPW m0, m4, m5
+ CLIPW m1, m4, m5
+ CLIPW m2, m4, m5
+ CLIPW m3, m4, m5
+ mova [%1 ], m0
+ mova [%1+32 ], m1
+ mova [%1+%2 ], m2
+ mova [%1+%2+32], m3
+%endmacro
+
+
+INIT_MMX mmxext
- cglobal hevc_transform_add4_10,3,4, 6
++cglobal hevc_add_residual4_10,3,4, 6
+ pxor m2, m2
+ mova m3, [max_pixels_10]
+ TR_ADD_MMX4_10 r0, r2, r1
+ add r1, 16
+ lea r0, [r0+2*r2]
+ TR_ADD_MMX4_10 r0, r2, r1
+ RET
+
+;-----------------------------------------------------------------------------
- ; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride)
++; void ff_hevc_add_residual_10(pixel *dst, int16_t *block, int stride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
- cglobal hevc_transform_add8_10,3,4,6
++cglobal hevc_add_residual8_10,3,4,6
+ pxor m4, m4
+ mova m5, [max_pixels_10]
+ lea r3, [r2*3]
+
+ TR_ADD_SSE_8_10 r0, r2, r3, r1
+ lea r0, [r0+r2*4]
+ add r1, 64
+ TR_ADD_SSE_8_10 r0, r2, r3, r1
+ RET
+
- cglobal hevc_transform_add16_10,3,4,6
++cglobal hevc_add_residual16_10,3,4,6
+ pxor m4, m4
+ mova m5, [max_pixels_10]
+
+ TRANS_ADD_SSE_16_10 r0, r2, r1
+%rep 7
+ lea r0, [r0+r2*2]
+ add r1, 64
+ TRANS_ADD_SSE_16_10 r0, r2, r1
+%endrep
+ RET
+
- cglobal hevc_transform_add32_10,3,4,6
++cglobal hevc_add_residual32_10,3,4,6
+ pxor m4, m4
+ mova m5, [max_pixels_10]
+
+ TRANS_ADD_SSE_32_10 r0, r1
+%rep 31
+ lea r0, [r0+r2]
+ add r1, 64
+ TRANS_ADD_SSE_32_10 r0, r1
+%endrep
+ RET
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+
- cglobal hevc_transform_add16_10,3,4,6
++cglobal hevc_add_residual16_10,3,4,6
+ pxor m4, m4
+ mova m5, [max_pixels_10]
+ lea r3, [r2*3]
+
+ TRANS_ADD16_AVX2 r0, r2, r3, r1
+%rep 3
+ lea r0, [r0+r2*4]
+ add r1, 128
+ TRANS_ADD16_AVX2 r0, r2, r3, r1
+%endrep
+ RET
+
- cglobal hevc_transform_add32_10,3,4,6
++cglobal hevc_add_residual32_10,3,4,6
+ pxor m4, m4
+ mova m5, [max_pixels_10]
+
+ TRANS_ADD32_AVX2 r0, r2, r1
+%rep 15
+ lea r0, [r0+r2*2]
+ add r1, 128
+ TRANS_ADD32_AVX2 r0, r2, r1
+%endrep
+ RET
+%endif ;HAVE_AVX2_EXTERNAL
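
The 10-bit variants need no such trick: 16-bit pixels leave headroom for a plain paddw, so each macro simply adds and then CLIPWs the result against max_pixels_10 (pw_1023). In scalar terms, under the same illustrative naming as above:

#include <stddef.h>
#include <stdint.h>

/* Scalar equivalent of the 10-bit add_residual hooks; dst is viewed as
 * uint16_t and stride counts uint16_t units here (an assumption of the
 * sketch, not the exact calling convention). */
static void add_residual_10(uint16_t *dst, const int16_t *res,
                            ptrdiff_t stride, int size)
{
    for (int y = 0; y < size; y++) {
        for (int x = 0; x < size; x++) {
            int v = dst[x] + res[x];
            dst[x] = v < 0 ? 0 : v > 1023 ? 1023 : v;
        }
        dst += stride;
        res += size;
    }
}
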
diff --cc libavcodec/x86/hevcdsp.h
index ad8168f,0000000..3cfdc27
mode 100644,000000..100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@@ -1,261 -1,0 +1,261 @@@
+/*
+ * HEVC video decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
+ *
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_HEVCDSP_H
+#define AVCODEC_X86_HEVCDSP_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+
+#define idct_dc_proto(size, bitd, opt) \
+ void ff_hevc_idct##size##_dc_add_##bitd##_##opt(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+
+#define PEL_LINK(dst, idx1, idx2, idx3, name, D, opt) \
+dst[idx1][idx2][idx3] = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \
+dst ## _bi[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \
+dst ## _uni[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt; \
+dst ## _uni_w[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt; \
+dst ## _bi_w[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt
+
+
+#define PEL_PROTOTYPE(name, D, opt) \
+void ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); \
+void ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width); \
+void ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width); \
+void ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width); \
+void ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, int denom, int wx0, int wx1, int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+
+
+///////////////////////////////////////////////////////////////////////////////
+// MC functions
+///////////////////////////////////////////////////////////////////////////////
+
+#define EPEL_PROTOTYPES(fname, bitd, opt) \
+ PEL_PROTOTYPE(fname##4, bitd, opt); \
+ PEL_PROTOTYPE(fname##6, bitd, opt); \
+ PEL_PROTOTYPE(fname##8, bitd, opt); \
+ PEL_PROTOTYPE(fname##12, bitd, opt); \
+ PEL_PROTOTYPE(fname##16, bitd, opt); \
+ PEL_PROTOTYPE(fname##24, bitd, opt); \
+ PEL_PROTOTYPE(fname##32, bitd, opt); \
+ PEL_PROTOTYPE(fname##48, bitd, opt); \
+ PEL_PROTOTYPE(fname##64, bitd, opt)
+
+#define QPEL_PROTOTYPES(fname, bitd, opt) \
+ PEL_PROTOTYPE(fname##4, bitd, opt); \
+ PEL_PROTOTYPE(fname##8, bitd, opt); \
+ PEL_PROTOTYPE(fname##12, bitd, opt); \
+ PEL_PROTOTYPE(fname##16, bitd, opt); \
+ PEL_PROTOTYPE(fname##24, bitd, opt); \
+ PEL_PROTOTYPE(fname##32, bitd, opt); \
+ PEL_PROTOTYPE(fname##48, bitd, opt); \
+ PEL_PROTOTYPE(fname##64, bitd, opt)
+
+#define WEIGHTING_PROTOTYPE(width, bitd, opt) \
+void ff_hevc_put_hevc_uni_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int height, int denom, int _wx, int _ox); \
+void ff_hevc_put_hevc_bi_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int16_t *_src2, int height, int denom, int _wx0, int _wx1, int _ox0, int _ox1)
+
+#define WEIGHTING_PROTOTYPES(bitd, opt) \
+ WEIGHTING_PROTOTYPE(2, bitd, opt); \
+ WEIGHTING_PROTOTYPE(4, bitd, opt); \
+ WEIGHTING_PROTOTYPE(6, bitd, opt); \
+ WEIGHTING_PROTOTYPE(8, bitd, opt); \
+ WEIGHTING_PROTOTYPE(12, bitd, opt); \
+ WEIGHTING_PROTOTYPE(16, bitd, opt); \
+ WEIGHTING_PROTOTYPE(24, bitd, opt); \
+ WEIGHTING_PROTOTYPE(32, bitd, opt); \
+ WEIGHTING_PROTOTYPE(48, bitd, opt); \
+ WEIGHTING_PROTOTYPE(64, bitd, opt)
+
+
+///////////////////////////////////////////////////////////////////////////////
+// QPEL_PIXELS EPEL_PIXELS
+///////////////////////////////////////////////////////////////////////////////
+EPEL_PROTOTYPES(pel_pixels , 8, sse4);
+EPEL_PROTOTYPES(pel_pixels , 10, sse4);
+EPEL_PROTOTYPES(pel_pixels , 12, sse4);
+
+void ff_hevc_put_hevc_pel_pixels16_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels24_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels32_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels48_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels64_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+
+void ff_hevc_put_hevc_pel_pixels16_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels24_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels32_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels48_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels64_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+
+
+
+void ff_hevc_put_hevc_uni_pel_pixels32_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels48_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels64_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels96_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); //used for 10bit
+void ff_hevc_put_hevc_uni_pel_pixels128_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);//used for 10bit
+
+
+void ff_hevc_put_hevc_bi_pel_pixels16_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels24_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels32_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels48_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels64_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+
+void ff_hevc_put_hevc_bi_pel_pixels16_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels24_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels32_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels48_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels64_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+
+///////////////////////////////////////////////////////////////////////////////
+// EPEL
+///////////////////////////////////////////////////////////////////////////////
+EPEL_PROTOTYPES(epel_h , 8, sse4);
+EPEL_PROTOTYPES(epel_h , 10, sse4);
+EPEL_PROTOTYPES(epel_h , 12, sse4);
+
+EPEL_PROTOTYPES(epel_v , 8, sse4);
+EPEL_PROTOTYPES(epel_v , 10, sse4);
+EPEL_PROTOTYPES(epel_v , 12, sse4);
+
+EPEL_PROTOTYPES(epel_hv , 8, sse4);
+EPEL_PROTOTYPES(epel_hv , 10, sse4);
+EPEL_PROTOTYPES(epel_hv , 12, sse4);
+
+PEL_PROTOTYPE(epel_h16, 8, avx2);
+PEL_PROTOTYPE(epel_h24, 8, avx2);
+PEL_PROTOTYPE(epel_h32, 8, avx2);
+PEL_PROTOTYPE(epel_h48, 8, avx2);
+PEL_PROTOTYPE(epel_h64, 8, avx2);
+
+PEL_PROTOTYPE(epel_h16,10, avx2);
+PEL_PROTOTYPE(epel_h24,10, avx2);
+PEL_PROTOTYPE(epel_h32,10, avx2);
+PEL_PROTOTYPE(epel_h48,10, avx2);
+PEL_PROTOTYPE(epel_h64,10, avx2);
+
+PEL_PROTOTYPE(epel_v16, 8, avx2);
+PEL_PROTOTYPE(epel_v24, 8, avx2);
+PEL_PROTOTYPE(epel_v32, 8, avx2);
+PEL_PROTOTYPE(epel_v48, 8, avx2);
+PEL_PROTOTYPE(epel_v64, 8, avx2);
+
+PEL_PROTOTYPE(epel_v16,10, avx2);
+PEL_PROTOTYPE(epel_v24,10, avx2);
+PEL_PROTOTYPE(epel_v32,10, avx2);
+PEL_PROTOTYPE(epel_v48,10, avx2);
+PEL_PROTOTYPE(epel_v64,10, avx2);
+
+PEL_PROTOTYPE(epel_hv16, 8, avx2);
+PEL_PROTOTYPE(epel_hv24, 8, avx2);
+PEL_PROTOTYPE(epel_hv32, 8, avx2);
+PEL_PROTOTYPE(epel_hv48, 8, avx2);
+PEL_PROTOTYPE(epel_hv64, 8, avx2);
+
+PEL_PROTOTYPE(epel_hv16,10, avx2);
+PEL_PROTOTYPE(epel_hv24,10, avx2);
+PEL_PROTOTYPE(epel_hv32,10, avx2);
+PEL_PROTOTYPE(epel_hv48,10, avx2);
+PEL_PROTOTYPE(epel_hv64,10, avx2);
+
+///////////////////////////////////////////////////////////////////////////////
+// QPEL
+///////////////////////////////////////////////////////////////////////////////
+QPEL_PROTOTYPES(qpel_h , 8, sse4);
+QPEL_PROTOTYPES(qpel_h , 10, sse4);
+QPEL_PROTOTYPES(qpel_h , 12, sse4);
+
+QPEL_PROTOTYPES(qpel_v, 8, sse4);
+QPEL_PROTOTYPES(qpel_v, 10, sse4);
+QPEL_PROTOTYPES(qpel_v, 12, sse4);
+
+QPEL_PROTOTYPES(qpel_hv, 8, sse4);
+QPEL_PROTOTYPES(qpel_hv, 10, sse4);
+QPEL_PROTOTYPES(qpel_hv, 12, sse4);
+
+PEL_PROTOTYPE(qpel_h16, 8, avx2);
+PEL_PROTOTYPE(qpel_h24, 8, avx2);
+PEL_PROTOTYPE(qpel_h32, 8, avx2);
+PEL_PROTOTYPE(qpel_h48, 8, avx2);
+PEL_PROTOTYPE(qpel_h64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_h16,10, avx2);
+PEL_PROTOTYPE(qpel_h24,10, avx2);
+PEL_PROTOTYPE(qpel_h32,10, avx2);
+PEL_PROTOTYPE(qpel_h48,10, avx2);
+PEL_PROTOTYPE(qpel_h64,10, avx2);
+
+PEL_PROTOTYPE(qpel_v16, 8, avx2);
+PEL_PROTOTYPE(qpel_v24, 8, avx2);
+PEL_PROTOTYPE(qpel_v32, 8, avx2);
+PEL_PROTOTYPE(qpel_v48, 8, avx2);
+PEL_PROTOTYPE(qpel_v64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_v16,10, avx2);
+PEL_PROTOTYPE(qpel_v24,10, avx2);
+PEL_PROTOTYPE(qpel_v32,10, avx2);
+PEL_PROTOTYPE(qpel_v48,10, avx2);
+PEL_PROTOTYPE(qpel_v64,10, avx2);
+
+PEL_PROTOTYPE(qpel_hv16, 8, avx2);
+PEL_PROTOTYPE(qpel_hv24, 8, avx2);
+PEL_PROTOTYPE(qpel_hv32, 8, avx2);
+PEL_PROTOTYPE(qpel_hv48, 8, avx2);
+PEL_PROTOTYPE(qpel_hv64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_hv16,10, avx2);
+PEL_PROTOTYPE(qpel_hv24,10, avx2);
+PEL_PROTOTYPE(qpel_hv32,10, avx2);
+PEL_PROTOTYPE(qpel_hv48,10, avx2);
+PEL_PROTOTYPE(qpel_hv64,10, avx2);
+
+WEIGHTING_PROTOTYPES(8, sse4);
+WEIGHTING_PROTOTYPES(10, sse4);
+WEIGHTING_PROTOTYPES(12, sse4);
+
+///////////////////////////////////////////////////////////////////////////////
+// ADD_RESIDUAL
+///////////////////////////////////////////////////////////////////////////////
- void ff_hevc_transform_add4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_transform_add8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_transform_add32_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual32_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
- void ff_hevc_transform_add8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_transform_add16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_transform_add32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
- void ff_hevc_transform_add32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
- void ff_hevc_transform_add4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_transform_add8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_transform_add16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_transform_add32_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual32_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
- void ff_hevc_transform_add16_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
- void ff_hevc_transform_add32_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual16_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
++void ff_hevc_add_residual32_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
+#endif // AVCODEC_X86_HEVCDSP_H
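
The PEL_LINK/PEL_PROTOTYPE machinery in this header is pure token pasting: the ## on dst appends the _bi/_uni/... suffixes to the last token of the table argument. As an illustration (this exact invocation is hypothetical; the real calls go through the EPEL_LINKS/QPEL_LINKS wrappers in hevcdsp_init.c), PEL_LINK(c->put_hevc_epel, 1, 0, 1, epel_h4, 8, sse4) would expand to:

c->put_hevc_epel[1][0][1]       = ff_hevc_put_hevc_epel_h4_8_sse4;
c->put_hevc_epel_bi[1][0][1]    = ff_hevc_put_hevc_bi_epel_h4_8_sse4;
c->put_hevc_epel_uni[1][0][1]   = ff_hevc_put_hevc_uni_epel_h4_8_sse4;
c->put_hevc_epel_uni_w[1][0][1] = ff_hevc_put_hevc_uni_w_epel_h4_8_sse4;
c->put_hevc_epel_bi_w[1][0][1]  = ff_hevc_put_hevc_bi_w_epel_h4_8_sse4;
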
diff --cc libavcodec/x86/hevcdsp_init.c
index 09eb06d,fd22fc3..da73d76
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@@ -696,419 -207,102 +696,419 @@@ void ff_hevc_dsp_init_x86(HEVCDSPContex
{
int cpu_flags = av_get_cpu_flags();
-#define SET_LUMA_FUNCS(tabname, funcname, depth, cf) \
- c->tabname[0] = funcname ## _4_ ## depth ## _ ## cf; \
- c->tabname[1] = funcname ## _8_ ## depth ## _ ## cf; \
- c->tabname[2] = funcname ## _12_ ## depth ## _ ## cf; \
- c->tabname[3] = funcname ## _16_ ## depth ## _ ## cf; \
- c->tabname[4] = funcname ## _24_ ## depth ## _ ## cf; \
- c->tabname[5] = funcname ## _32_ ## depth ## _ ## cf; \
- c->tabname[6] = funcname ## _48_ ## depth ## _ ## cf; \
- c->tabname[7] = funcname ## _64_ ## depth ## _ ## cf;
-
-#define SET_CHROMA_FUNCS(tabname, funcname, depth, cf) \
- c->tabname[1] = funcname ## _4_ ## depth ## _ ## cf; \
- c->tabname[3] = funcname ## _8_ ## depth ## _ ## cf; \
- c->tabname[4] = funcname ## _12_ ## depth ## _ ## cf; \
- c->tabname[5] = funcname ## _16_ ## depth ## _ ## cf; \
- c->tabname[6] = funcname ## _24_ ## depth ## _ ## cf; \
- c->tabname[7] = funcname ## _32_ ## depth ## _ ## cf;
-
-#define SET_QPEL_FUNCS(v, h, depth, cf, name) SET_LUMA_FUNCS (put_hevc_qpel[v][h], name, depth, cf)
-#define SET_EPEL_FUNCS(v, h, depth, cf, name) SET_CHROMA_FUNCS(put_hevc_epel[v][h], name, depth, cf)
-
if (bit_depth == 8) {
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ c->idct_dc[0] = ff_hevc_idct4x4_dc_8_mmxext;
+ c->idct_dc[1] = ff_hevc_idct8x8_dc_8_mmxext;
- c->transform_add[0] = ff_hevc_transform_add4_8_mmxext;
++ c->add_residual[0] = ff_hevc_add_residual4_8_mmxext;
+ }
if (EXTERNAL_SSE2(cpu_flags)) {
c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
+ if (ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_sse2;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_sse2;
+
+ }
+ SAO_BAND_INIT(8, sse2);
- SET_QPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
- SET_EPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
+ c->idct_dc[1] = ff_hevc_idct8x8_dc_8_sse2;
+ c->idct_dc[2] = ff_hevc_idct16x16_dc_8_sse2;
+ c->idct_dc[3] = ff_hevc_idct32x32_dc_8_sse2;
- c->transform_add[1] = ff_hevc_transform_add8_8_sse2;
- c->transform_add[2] = ff_hevc_transform_add16_8_sse2;
- c->transform_add[3] = ff_hevc_transform_add32_8_sse2;
- SET_LUMA_FUNCS(put_unweighted_pred, ff_hevc_put_unweighted_pred, 8, sse2);
- SET_LUMA_FUNCS(put_unweighted_pred_avg, ff_hevc_put_unweighted_pred_avg, 8, sse2);
- SET_CHROMA_FUNCS(put_unweighted_pred_chroma, ff_hevc_put_unweighted_pred, 8, sse2);
- SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 8, sse2);
++ c->add_residual[1] = ff_hevc_add_residual8_8_sse2;
++ c->add_residual[2] = ff_hevc_add_residual16_8_sse2;
++ c->add_residual[3] = ff_hevc_add_residual32_8_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
- SET_QPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_qpel_h);
- SET_QPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_qpel_v);
- SET_EPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_epel_h);
- SET_EPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_epel_v);
+ if(ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
+ }
+ SAO_EDGE_INIT(8, ssse3);
+ }
+ if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
+
+ EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 8, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 8, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 8, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 8, sse4);
+
+ QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 8, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 8, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 8, sse4);
+ }
+ if (EXTERNAL_AVX(cpu_flags)) {
+ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_avx;
+ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_avx;
+ if (ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx;
+ }
+ SAO_BAND_INIT(8, avx);
+
- c->transform_add[1] = ff_hevc_transform_add8_8_avx;
- c->transform_add[2] = ff_hevc_transform_add16_8_avx;
- c->transform_add[3] = ff_hevc_transform_add32_8_avx;
++ c->add_residual[1] = ff_hevc_add_residual8_8_avx;
++ c->add_residual[2] = ff_hevc_add_residual16_8_avx;
++ c->add_residual[3] = ff_hevc_add_residual32_8_avx;
+ }
+ if (EXTERNAL_AVX2(cpu_flags)) {
+ c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_8_avx2;
+ c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_8_avx2;
+ }
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2;
+ c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2;
+ if (ARCH_X86_64) {
+ c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
+ c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
+ c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
+
+ c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
+ c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
+ c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
+
+ c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+ c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+ c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+
+ c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+ c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+ c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+
+ c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
+ c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
+ c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
+
+ c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
+ c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
+ c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
+
+ c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_avx2;
+ c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_8_avx2;
+ c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_8_avx2;
+
+ c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_avx2;
+ c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_8_avx2;
+ c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_8_avx2;
+
+ c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_avx2;
+ c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_8_avx2;
+ c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_8_avx2;
+
+ c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_avx2;
+ c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_8_avx2;
+ c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_8_avx2;
+
+ c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_avx2;
+ c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_8_avx2;
+ c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_8_avx2;
+
+ c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_avx2;
+ c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_8_avx2;
+ c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_8_avx2;
+
+ c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_avx2;
+ c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_8_avx2;
+ c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_8_avx2;
+
+ c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_avx2;
+ c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_8_avx2;
+ c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_8_avx2;
+
+ c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_avx2;
+ c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_8_avx2;
+ c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_8_avx2;
+
+ c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_avx2;
+ c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_avx2;
+ c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_avx2;
+
+ c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_avx2;
+ c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_avx2;
+ c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_avx2;
+
+ c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_avx2;
+ c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_avx2;
+ c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_avx2;
+
+ c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_avx2;
+ c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_avx2;
+ c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_avx2;
+
+ c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_avx2;
+ c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_avx2;
+ c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_avx2;
+
+ c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_avx2;
+ c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_avx2;
+ c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_avx2;
+ }
+ SAO_BAND_INIT(8, avx2);
+
+ c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_8_avx2;
+ c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_8_avx2;
+ c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_8_avx2;
+
- c->transform_add[3] = ff_hevc_transform_add32_8_avx2;
++ c->add_residual[3] = ff_hevc_add_residual32_8_avx2;
}
} else if (bit_depth == 10) {
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
- c->transform_add[0] = ff_hevc_transform_add4_10_mmxext;
++ c->add_residual[0] = ff_hevc_add_residual4_10_mmxext;
+ c->idct_dc[0] = ff_hevc_idct4x4_dc_10_mmxext;
+ c->idct_dc[1] = ff_hevc_idct8x8_dc_10_mmxext;
+ }
if (EXTERNAL_SSE2(cpu_flags)) {
c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2;
+ if (ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_sse2;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_sse2;
+ }
+ SAO_BAND_INIT(10, sse2);
+ SAO_EDGE_INIT(10, sse2);
- SET_QPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
- SET_EPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
+ c->idct_dc[1] = ff_hevc_idct8x8_dc_10_sse2;
+ c->idct_dc[2] = ff_hevc_idct16x16_dc_10_sse2;
+ c->idct_dc[3] = ff_hevc_idct32x32_dc_10_sse2;
- c->transform_add[1] = ff_hevc_transform_add8_10_sse2;
- c->transform_add[2] = ff_hevc_transform_add16_10_sse2;
- c->transform_add[3] = ff_hevc_transform_add32_10_sse2;
- SET_LUMA_FUNCS(put_unweighted_pred, ff_hevc_put_unweighted_pred, 10, sse2);
- SET_LUMA_FUNCS(put_unweighted_pred_avg, ff_hevc_put_unweighted_pred_avg, 10, sse2);
- SET_CHROMA_FUNCS(put_unweighted_pred_chroma, ff_hevc_put_unweighted_pred, 10, sse2);
- SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 10, sse2);
++ c->add_residual[1] = ff_hevc_add_residual8_10_sse2;
++ c->add_residual[2] = ff_hevc_add_residual16_10_sse2;
++ c->add_residual[3] = ff_hevc_add_residual32_10_sse2;
}
- }
+ if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
+ }
+ if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
+ EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 10, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 10, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 10, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 10, sse4);
-#if ARCH_X86_64
- if (bit_depth == 8) {
- if (EXTERNAL_SSSE3(cpu_flags)) {
- c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
- c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
+ QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 10, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 10, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 10, sse4);
+ }
+ if (EXTERNAL_AVX(cpu_flags)) {
+ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_avx;
+ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_avx;
+ if (ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_avx;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_avx;
+ }
+ SAO_BAND_INIT(10, avx);
}
+ if (EXTERNAL_AVX2(cpu_flags)) {
+ c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_10_avx2;
+ }
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ c->idct_dc[2] = ff_hevc_idct16x16_dc_10_avx2;
+ c->idct_dc[3] = ff_hevc_idct32x32_dc_10_avx2;
+ if (ARCH_X86_64) {
+ c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
+ c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
+ c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
+ c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
+ c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
+
+ c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
+ c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
+ c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
+ c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
+ c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
+
+ c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+ c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+ c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+ c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
+ c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
+
+ c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+ c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+ c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+ c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
+ c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
+
+ c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
+ c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
+ c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
+ c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
+ c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
+ c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
+ c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
+ c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
+ c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
+ c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
+
+ c->put_hevc_epel[5][0][1] = ff_hevc_put_hevc_epel_h16_10_avx2;
+ c->put_hevc_epel[6][0][1] = ff_hevc_put_hevc_epel_h24_10_avx2;
+ c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_10_avx2;
+ c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_10_avx2;
+ c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_10_avx2;
+
+ c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_10_avx2;
+ c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_10_avx2;
+ c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_10_avx2;
+ c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_10_avx2;
+ c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_10_avx2;
+
+ c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_10_avx2;
+ c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_10_avx2;
+ c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_10_avx2;
+ c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_10_avx2;
+ c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_10_avx2;
+
+ c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_10_avx2;
+ c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_10_avx2;
+ c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_10_avx2;
+ c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_10_avx2;
+ c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_10_avx2;
+
+ c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_10_avx2;
+ c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_10_avx2;
+ c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_10_avx2;
+ c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_10_avx2;
+ c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_10_avx2;
+
+ c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_10_avx2;
+ c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_10_avx2;
+ c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_10_avx2;
+ c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_10_avx2;
+ c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_10_avx2;
+
+ c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_10_avx2;
+ c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_10_avx2;
+ c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_10_avx2;
+ c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_10_avx2;
+ c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_10_avx2;
+
+ c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_10_avx2;
+ c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_10_avx2;
+ c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_10_avx2;
+ c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_10_avx2;
+ c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_10_avx2;
+
+ c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_10_avx2;
+ c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_10_avx2;
+ c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_10_avx2;
+ c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_10_avx2;
+ c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_10_avx2;
+
+ c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_10_avx2;
+ c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_10_avx2;
+ c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_10_avx2;
+ c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_10_avx2;
+ c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_10_avx2;
- if (EXTERNAL_SSE4(cpu_flags)) {
- SET_LUMA_FUNCS(weighted_pred, ff_hevc_put_weighted_pred, 8, sse4);
- SET_CHROMA_FUNCS(weighted_pred_chroma, ff_hevc_put_weighted_pred, 8, sse4);
- SET_LUMA_FUNCS(weighted_pred_avg, ff_hevc_put_weighted_pred_avg, 8, sse4);
- SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 8, sse4);
+ c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_10_avx2;
+ c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_10_avx2;
+ c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_10_avx2;
+ c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_10_avx2;
+ c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_10_avx2;
+
+ c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_10_avx2;
+ c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_10_avx2;
+ c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_10_avx2;
+ c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_10_avx2;
+ c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_10_avx2;
+
+ c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_10_avx2;
+ c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_10_avx2;
+ c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_10_avx2;
+ c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_10_avx2;
+ c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_10_avx2;
+
+ c->put_hevc_qpel_uni[5][1][0] = ff_hevc_put_hevc_uni_qpel_v16_10_avx2;
+ c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_10_avx2;
+ c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_10_avx2;
+ c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_10_avx2;
+ c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_10_avx2;
+
+ c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_10_avx2;
+ c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_10_avx2;
+ c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_10_avx2;
+ c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_10_avx2;
+ c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_10_avx2;
+
+ c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_10_avx2;
+ c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_10_avx2;
+ c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_10_avx2;
+ c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_10_avx2;
+ c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_10_avx2;
+
+ c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_10_avx2;
+ c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_10_avx2;
+ c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_10_avx2;
+ c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_10_avx2;
+ c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_10_avx2;
+
+ c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_10_avx2;
+ c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_10_avx2;
+ c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_10_avx2;
+ c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_10_avx2;
+ c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_10_avx2;
+ }
+ SAO_BAND_INIT(10, avx2);
+ SAO_EDGE_INIT(10, avx2);
+
- c->transform_add[2] = ff_hevc_transform_add16_10_avx2;
- c->transform_add[3] = ff_hevc_transform_add32_10_avx2;
++ c->add_residual[2] = ff_hevc_add_residual16_10_avx2;
++ c->add_residual[3] = ff_hevc_add_residual32_10_avx2;
+
+ }
+ } else if (bit_depth == 12) {
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ c->idct_dc[0] = ff_hevc_idct4x4_dc_12_mmxext;
+ c->idct_dc[1] = ff_hevc_idct8x8_dc_12_mmxext;
}
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2;
+ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2;
+ if (ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
+ }
+ SAO_BAND_INIT(12, sse2);
+ SAO_EDGE_INIT(12, sse2);
- if (EXTERNAL_AVX(cpu_flags)) {
-#if HAVE_AVX_EXTERNAL
- SET_QPEL_FUNCS(1, 1, 8, avx, hevc_qpel_hv);
- SET_EPEL_FUNCS(1, 1, 8, avx, hevc_epel_hv);
-#endif /* HAVE_AVX_EXTERNAL */
+ c->idct_dc[1] = ff_hevc_idct8x8_dc_12_sse2;
+ c->idct_dc[2] = ff_hevc_idct16x16_dc_12_sse2;
+ c->idct_dc[3] = ff_hevc_idct32x32_dc_12_sse2;
}
- } else if (bit_depth == 10) {
- if (EXTERNAL_SSSE3(cpu_flags)) {
- c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
- c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
+ if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_ssse3;
}
- if (EXTERNAL_SSE4(cpu_flags)) {
- SET_LUMA_FUNCS(weighted_pred, ff_hevc_put_weighted_pred, 10, sse4);
- SET_CHROMA_FUNCS(weighted_pred_chroma, ff_hevc_put_weighted_pred, 10, sse4);
- SET_LUMA_FUNCS(weighted_pred_avg, ff_hevc_put_weighted_pred_avg, 10, sse4);
- SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 10, sse4);
+ if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
+ EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 12, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 12, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 12, sse4);
+ EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 12, sse4);
+
+ QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 12, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 12, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 12, sse4);
+ QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 12, sse4);
}
if (EXTERNAL_AVX(cpu_flags)) {
-#if HAVE_AVX_EXTERNAL
- SET_QPEL_FUNCS(0, 1, 10, avx, ff_hevc_qpel_h);
- SET_QPEL_FUNCS(1, 0, 10, avx, ff_hevc_qpel_v);
- SET_QPEL_FUNCS(1, 1, 10, avx, hevc_qpel_hv);
- SET_EPEL_FUNCS(0, 1, 10, avx, ff_hevc_epel_h);
- SET_EPEL_FUNCS(1, 0, 10, avx, ff_hevc_epel_v);
- SET_EPEL_FUNCS(1, 1, 10, avx, hevc_epel_hv);
-#endif /* HAVE_AVX_EXTERNAL */
+ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_avx;
+ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_avx;
+ if (ARCH_X86_64) {
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_avx;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_avx;
+ }
+ SAO_BAND_INIT(12, avx);
+ }
+ if (EXTERNAL_AVX2(cpu_flags)) {
+ c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_12_avx2;
+ }
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ c->idct_dc[2] = ff_hevc_idct16x16_dc_12_avx2;
+ c->idct_dc[3] = ff_hevc_idct32x32_dc_12_avx2;
+
+ SAO_BAND_INIT(12, avx2);
+ SAO_EDGE_INIT(12, avx2);
}
}
-#endif /* ARCH_X86_64 */
}
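
All of these per-ISA init functions plug into the same dispatch shape: the generic init fills the HEVCDSPContext with bit-depth-specific C fallbacks, then hands the table to each arch init, which overwrites only the entries the runtime CPU flags allow. A stripped-down sketch of that shape (not the literal hevcdsp.c code):

/* Generic init: C fallbacks first, arch overrides second. */
void ff_hevc_dsp_init(HEVCDSPContext *c, int bit_depth)
{
    /* ... assign bit-depth-specific C implementations here ... */

    if (ARCH_MIPS)
        ff_hevc_dsp_init_mips(c, bit_depth);
    if (ARCH_X86)
        ff_hevc_dsp_init_x86(c, bit_depth);
    /* ... other architectures likewise ... */
}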