[FFmpeg-devel] [PATCH] avcodec/mips: Improve avc bi-weighted mc msa functions
Manojkumar Bhosale
Manojkumar.Bhosale at imgtec.com
Mon Oct 9 11:56:52 EEST 2017
LGTM
-----Original Message-----
From: ffmpeg-devel [mailto:ffmpeg-devel-bounces at ffmpeg.org] On Behalf Of kaustubh.raste at imgtec.com
Sent: Monday, October 9, 2017 12:49 PM
To: ffmpeg-devel at ffmpeg.org
Cc: Kaustubh Raste
Subject: [FFmpeg-devel] [PATCH] avcodec/mips: Improve avc bi-weighted mc msa functions
From: Kaustubh Raste <kaustubh.raste at imgtec.com>
Replace the generic functions with block-size-specific functions.
Signed-off-by: Kaustubh Raste <kaustubh.raste at imgtec.com>
---
libavcodec/mips/h264dsp_msa.c | 469 +++++++++++++++++++++++------------
libavutil/mips/generic_macros_msa.h | 4 +
2 files changed, 311 insertions(+), 162 deletions(-)
diff --git a/libavcodec/mips/h264dsp_msa.c b/libavcodec/mips/h264dsp_msa.c
index 5b06bd9..e50f5ca 100644
--- a/libavcodec/mips/h264dsp_msa.c
+++ b/libavcodec/mips/h264dsp_msa.c
@@ -223,217 +223,242 @@ static void avc_wgt_8x16_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
}
}
-static void avc_biwgt_4x2_msa(uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
+static void avc_biwgt_4x2_msa(uint8_t *src, uint8_t *dst, int32_t
+stride,
int32_t log2_denom, int32_t src_weight,
int32_t dst_weight, int32_t offset_in) {
- uint32_t load0, load1, out0, out1;
- v16i8 src_wgt, dst_wgt, wgt;
- v16i8 src0, src1, dst0, dst1;
- v8i16 temp0, temp1, denom, offset, add_val;
- int32_t val = 128 * (src_weight + dst_weight);
+ uint32_t tp0, tp1;
+ v16i8 src_wgt, dst_wgt, wgt, vec0;
+ v16u8 src0 = { 0 }, dst0 = { 0 };
+ v8i16 tmp0, denom, offset, max255 = __msa_ldi_h(255);
- offset_in = ((offset_in + 1) | 1) << log2_denom;
+ offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
+ offset_in += (128 * (src_weight + dst_weight));
src_wgt = __msa_fill_b(src_weight);
dst_wgt = __msa_fill_b(dst_weight);
offset = __msa_fill_h(offset_in);
denom = __msa_fill_h(log2_denom + 1);
- add_val = __msa_fill_h(val);
- offset += add_val;
wgt = __msa_ilvev_b(dst_wgt, src_wgt);
- load0 = LW(src);
- src += src_stride;
- load1 = LW(src);
-
- src0 = (v16i8) __msa_fill_w(load0);
- src1 = (v16i8) __msa_fill_w(load1);
-
- load0 = LW(dst);
- load1 = LW(dst + dst_stride);
-
- dst0 = (v16i8) __msa_fill_w(load0);
- dst1 = (v16i8) __msa_fill_w(load1);
+ LW2(src, stride, tp0, tp1);
+ INSERT_W2_UB(tp0, tp1, src0);
+ LW2(dst, stride, tp0, tp1);
+ INSERT_W2_UB(tp0, tp1, dst0);
+ XORI_B2_128_UB(src0, dst0);
+ vec0 = (v16i8) __msa_ilvr_b((v16i8) dst0, (v16i8) src0);
+ tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
+ tmp0 >>= denom;
+ tmp0 = __msa_maxi_s_h(tmp0, 0);
+ tmp0 = __msa_min_s_h(max255, tmp0);
+ dst0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
+ ST4x2_UB(dst0, dst, stride);
+}
- XORI_B4_128_SB(src0, src1, dst0, dst1);
- ILVR_B2_SH(dst0, src0, dst1, src1, temp0, temp1);
+static void avc_biwgt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+ int32_t log2_denom, int32_t src_weight,
+ int32_t dst_weight, int32_t offset_in) {
+ uint32_t tp0, tp1, tp2, tp3;
+ v16i8 src_wgt, dst_wgt, wgt, vec0, vec1;
+ v16u8 src0, dst0;
+ v8i16 tmp0, tmp1, denom, offset;
- temp0 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp0);
- temp1 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp1);
+ offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
+ offset_in += (128 * (src_weight + dst_weight));
- temp0 >>= denom;
- temp1 >>= denom;
+ src_wgt = __msa_fill_b(src_weight);
+ dst_wgt = __msa_fill_b(dst_weight);
+ offset = __msa_fill_h(offset_in);
+ denom = __msa_fill_h(log2_denom + 1);
- CLIP_SH2_0_255(temp0, temp1);
- PCKEV_B2_SB(temp0, temp0, temp1, temp1, dst0, dst1);
+ wgt = __msa_ilvev_b(dst_wgt, src_wgt);
- out0 = __msa_copy_u_w((v4i32) dst0, 0);
- out1 = __msa_copy_u_w((v4i32) dst1, 0);
- SW(out0, dst);
- dst += dst_stride;
- SW(out1, dst);
+ LW4(src, stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
+ LW4(dst, stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+ XORI_B2_128_UB(src0, dst0);
+ ILVRL_B2_SB(dst0, src0, vec0, vec1);
+ tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
+ tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
+ tmp0 >>= denom;
+ tmp1 >>= denom;
+ CLIP_SH2_0_255(tmp0, tmp1);
+ dst0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+ ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
}
-static void avc_biwgt_4x4multiple_msa(uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int32_t height, int32_t log2_denom,
- int32_t src_weight, int32_t dst_weight,
- int32_t offset_in)
+static void avc_biwgt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+ int32_t log2_denom, int32_t src_weight,
+ int32_t dst_weight, int32_t offset_in)
{
- uint8_t cnt;
- uint32_t load0, load1, load2, load3;
- v16i8 src_wgt, dst_wgt, wgt;
- v16i8 src0, src1, src2, src3;
- v16i8 dst0, dst1, dst2, dst3;
- v8i16 temp0, temp1, temp2, temp3;
- v8i16 denom, offset, add_val;
- int32_t val = 128 * (src_weight + dst_weight);
+ uint32_t tp0, tp1, tp2, tp3;
+ v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3;
+ v16u8 src0, src1, dst0, dst1;
+ v8i16 tmp0, tmp1, tmp2, tmp3, denom, offset;
- offset_in = ((offset_in + 1) | 1) << log2_denom;
+ offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
+ offset_in += (128 * (src_weight + dst_weight));
src_wgt = __msa_fill_b(src_weight);
dst_wgt = __msa_fill_b(dst_weight);
offset = __msa_fill_h(offset_in);
denom = __msa_fill_h(log2_denom + 1);
- add_val = __msa_fill_h(val);
- offset += add_val;
-
wgt = __msa_ilvev_b(dst_wgt, src_wgt);
- for (cnt = height / 4; cnt--;) {
- LW4(src, src_stride, load0, load1, load2, load3);
- src += (4 * src_stride);
-
- src0 = (v16i8) __msa_fill_w(load0);
- src1 = (v16i8) __msa_fill_w(load1);
- src2 = (v16i8) __msa_fill_w(load2);
- src3 = (v16i8) __msa_fill_w(load3);
-
- LW4(dst, dst_stride, load0, load1, load2, load3);
+ LW4(src, stride, tp0, tp1, tp2, tp3);
+ src += 4 * stride;
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
+ LW4(src, stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
+ LW4(dst, stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+ LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
+ INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
+ XORI_B4_128_UB(src0, src1, dst0, dst1);
+ ILVRL_B2_SB(dst0, src0, vec0, vec1);
+ ILVRL_B2_SB(dst1, src1, vec2, vec3);
+ tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
+ tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
+ tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
+ tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
+ SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
+ CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
+ PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
+ ST4x8_UB(dst0, dst1, dst, stride);
+}
- dst0 = (v16i8) __msa_fill_w(load0);
- dst1 = (v16i8) __msa_fill_w(load1);
- dst2 = (v16i8) __msa_fill_w(load2);
- dst3 = (v16i8) __msa_fill_w(load3);
+static void avc_biwgt_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+ int32_t log2_denom, int32_t src_weight,
+ int32_t dst_weight, int32_t offset_in) {
+ uint64_t tp0, tp1, tp2, tp3;
+ v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3;
+ v16u8 src0, src1, dst0, dst1;
+ v8i16 tmp0, tmp1, tmp2, tmp3, denom, offset;
- XORI_B4_128_SB(src0, src1, src2, src3);
- XORI_B4_128_SB(dst0, dst1, dst2, dst3);
- ILVR_B4_SH(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
- temp0, temp1, temp2, temp3);
+ offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
+ offset_in += (128 * (src_weight + dst_weight));
- temp0 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp0);
- temp1 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp1);
- temp2 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp2);
- temp3 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp3);
+ src_wgt = __msa_fill_b(src_weight);
+ dst_wgt = __msa_fill_b(dst_weight);
+ offset = __msa_fill_h(offset_in);
+ denom = __msa_fill_h(log2_denom + 1);
- SRA_4V(temp0, temp1, temp2, temp3, denom);
- CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
- PCKEV_ST4x4_UB(temp0, temp1, temp2, temp3, dst, dst_stride);
- dst += (4 * dst_stride);
- }
-}
+ wgt = __msa_ilvev_b(dst_wgt, src_wgt);
-static void avc_biwgt_4width_msa(uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int32_t height, int32_t log2_denom,
- int32_t src_weight, int32_t dst_weight,
- int32_t offset_in)
-{
- if (2 == height) {
- avc_biwgt_4x2_msa(src, src_stride, dst, dst_stride, log2_denom,
- src_weight, dst_weight, offset_in);
- } else {
- avc_biwgt_4x4multiple_msa(src, src_stride, dst, dst_stride, height,
- log2_denom, src_weight, dst_weight,
- offset_in);
- }
+ LD4(src, stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, src0);
+ INSERT_D2_UB(tp2, tp3, src1);
+ LD4(dst, stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst0);
+ INSERT_D2_UB(tp2, tp3, dst1);
+ XORI_B4_128_UB(src0, src1, dst0, dst1);
+ ILVRL_B2_SB(dst0, src0, vec0, vec1);
+ ILVRL_B2_SB(dst1, src1, vec2, vec3);
+ tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
+ tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
+ tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
+ tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
+ SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
+ CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
+ PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
+ ST8x4_UB(dst0, dst1, dst, stride);
}
-static void avc_biwgt_8width_msa(uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int32_t height, int32_t log2_denom,
- int32_t src_weight, int32_t dst_weight,
- int32_t offset_in)
+static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+ int32_t log2_denom, int32_t src_weight,
+ int32_t dst_weight, int32_t offset_in)
{
- uint8_t cnt;
- v16i8 src_wgt, dst_wgt, wgt;
- v16i8 src0, src1, src2, src3;
- v16i8 dst0, dst1, dst2, dst3;
- v16i8 out0, out1;
- v8i16 temp0, temp1, temp2, temp3;
- v8i16 denom, offset, add_val;
- int32_t val = 128 * (src_weight + dst_weight);
+ uint64_t tp0, tp1, tp2, tp3;
+ v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom,
+ offset;
- offset_in = ((offset_in + 1) | 1) << log2_denom;
+ offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
+ offset_in += (128 * (src_weight + dst_weight));
src_wgt = __msa_fill_b(src_weight);
dst_wgt = __msa_fill_b(dst_weight);
offset = __msa_fill_h(offset_in);
denom = __msa_fill_h(log2_denom + 1);
- add_val = __msa_fill_h(val);
- offset += add_val;
-
wgt = __msa_ilvev_b(dst_wgt, src_wgt);
- for (cnt = height / 4; cnt--;) {
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
-
- LD_SB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- XORI_B4_128_SB(src0, src1, src2, src3);
- XORI_B4_128_SB(dst0, dst1, dst2, dst3);
- ILVR_B4_SH(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
- temp0, temp1, temp2, temp3);
-
- temp0 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp0);
- temp1 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp1);
- temp2 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp2);
- temp3 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp3);
-
- SRA_4V(temp0, temp1, temp2, temp3, denom);
- CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
- PCKEV_B2_SB(temp1, temp0, temp3, temp2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += 4 * dst_stride;
- }
+ LD4(src, stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, src0);
+ INSERT_D2_UB(tp2, tp3, src1);
+ LD4(src + 4 * stride, stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, src2);
+ INSERT_D2_UB(tp2, tp3, src3);
+ LD4(dst, stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst0);
+ INSERT_D2_UB(tp2, tp3, dst1);
+ LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst2);
+ INSERT_D2_UB(tp2, tp3, dst3);
+ XORI_B8_128_UB(src0, src1, src2, src3, dst0, dst1, dst2, dst3);
+ ILVRL_B2_SB(dst0, src0, vec0, vec1);
+ ILVRL_B2_SB(dst1, src1, vec2, vec3);
+ ILVRL_B2_SB(dst2, src2, vec4, vec5);
+ ILVRL_B2_SB(dst3, src3, vec6, vec7);
+ tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
+ tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
+ tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
+ tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
+ tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
+ tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
+ tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
+ tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
+ SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
+ SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
+ CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
+ CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7);
+ PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
+ PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, dst2, dst3);
+ ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
}
-static void avc_biwgt_16width_msa(uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int32_t height, int32_t log2_denom,
- int32_t src_weight, int32_t dst_weight,
- int32_t offset_in)
+static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+ int32_t log2_denom, int32_t src_weight,
+ int32_t dst_weight, int32_t offset_in)
{
uint8_t cnt;
+ uint64_t tp0, tp1, tp2, tp3;
v16i8 src_wgt, dst_wgt, wgt;
- v16i8 src0, src1, src2, src3;
- v16i8 dst0, dst1, dst2, dst3;
+ v16u8 src0, src1, src2, src3;
+ v16u8 dst0, dst1, dst2, dst3;
v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
- v8i16 denom, offset, add_val;
- int32_t val = 128 * (src_weight + dst_weight);
+ v8i16 denom, offset;
- offset_in = ((offset_in + 1) | 1) << log2_denom;
+ offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
+ offset_in += (128 * (src_weight + dst_weight));
src_wgt = __msa_fill_b(src_weight);
dst_wgt = __msa_fill_b(dst_weight);
offset = __msa_fill_h(offset_in);
denom = __msa_fill_h(log2_denom + 1);
- add_val = __msa_fill_h(val);
- offset += add_val;
-
wgt = __msa_ilvev_b(dst_wgt, src_wgt);
- for (cnt = height / 4; cnt--;) {
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
-
- LD_SB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- XORI_B4_128_SB(src0, src1, src2, src3);
- XORI_B4_128_SB(dst0, dst1, dst2, dst3);
+ for (cnt = 2; cnt--;) {
+ LD4(src, stride, tp0, tp1, tp2, tp3);
+ src += 4 * stride;
+ INSERT_D2_UB(tp0, tp1, src0);
+ INSERT_D2_UB(tp2, tp3, src1);
+ LD4(src, stride, tp0, tp1, tp2, tp3);
+ src += 4 * stride;
+ INSERT_D2_UB(tp0, tp1, src2);
+ INSERT_D2_UB(tp2, tp3, src3);
+ LD4(dst, stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst0);
+ INSERT_D2_UB(tp2, tp3, dst1);
+ LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
+ INSERT_D2_UB(tp0, tp1, dst2);
+ INSERT_D2_UB(tp2, tp3, dst3);
+ XORI_B4_128_UB(src0, src1, src2, src3);
+ XORI_B4_128_UB(dst0, dst1, dst2, dst3);
ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
vec0, vec2, vec4, vec6);
ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
@@ -452,10 +477,10 @@ static void avc_biwgt_16width_msa(uint8_t *src, int32_t src_stride,
SRA_4V(temp4, temp5, temp6, temp7, denom);
CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
- PCKEV_B4_SB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
+ PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7,
+ temp6,
dst0, dst1, dst2, dst3);
- ST_SB4(dst0, dst1, dst2, dst3, dst, dst_stride);
- dst += 4 * dst_stride;
+ ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
+ dst += 8 * stride;
}
}
@@ -2430,10 +2455,114 @@ void ff_weight_h264_pixels4_8_msa(uint8_t *src, ptrdiff_t stride,
void ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src,
ptrdiff_t stride, int height,
int log2_denom, int weight_dst,
- int weight_src, int offset)
+ int weight_src, int offset_in)
{
- avc_biwgt_16width_msa(src, stride, dst, stride, height, log2_denom,
- weight_src, weight_dst, offset);
+ v16i8 src_wgt, dst_wgt, wgt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+ v8i16 denom, offset;
+
+ offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
+ offset_in += (128 * (weight_src + weight_dst));
+
+ src_wgt = __msa_fill_b(weight_src);
+ dst_wgt = __msa_fill_b(weight_dst);
+ offset = __msa_fill_h(offset_in);
+ denom = __msa_fill_h(log2_denom + 1);
+
+ wgt = __msa_ilvev_b(dst_wgt, src_wgt);
+
+ LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += 8 * stride;
+ LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+ XORI_B8_128_UB(src0, src1, src2, src3, src4, src5, src6, src7);
+ XORI_B8_128_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+ ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec0, vec2, vec4,
+ vec6);
+ ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec1, vec3, vec5,
+ vec7);
+ ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10,
+ vec12, vec14);
+ ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11,
+ vec13, vec15);
+ tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
+ tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
+ tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
+ tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
+ tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
+ tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
+ tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
+ tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
+ tmp8 = __msa_dpadd_s_h(offset, wgt, vec8);
+ tmp9 = __msa_dpadd_s_h(offset, wgt, vec9);
+ tmp10 = __msa_dpadd_s_h(offset, wgt, vec10);
+ tmp11 = __msa_dpadd_s_h(offset, wgt, vec11);
+ tmp12 = __msa_dpadd_s_h(offset, wgt, vec12);
+ tmp13 = __msa_dpadd_s_h(offset, wgt, vec13);
+ tmp14 = __msa_dpadd_s_h(offset, wgt, vec14);
+ tmp15 = __msa_dpadd_s_h(offset, wgt, vec15);
+ SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
+ SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
+ SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
+ SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
+ CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
+ CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7);
+ CLIP_SH4_0_255(tmp8, tmp9, tmp10, tmp11);
+ CLIP_SH4_0_255(tmp12, tmp13, tmp14, tmp15);
+ PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
+ dst2, dst3);
+ PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
+ dst5, dst6, dst7);
+ ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
+ dst += 8 * stride;
+
+ if (16 == height) {
+ LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+ XORI_B8_128_UB(src0, src1, src2, src3, src4, src5, src6, src7);
+ XORI_B8_128_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+ ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec0, vec2,
+ vec4, vec6);
+ ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec1, vec3,
+ vec5, vec7);
+ ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10,
+ vec12, vec14);
+ ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11,
+ vec13, vec15);
+ tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
+ tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
+ tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
+ tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
+ tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
+ tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
+ tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
+ tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
+ tmp8 = __msa_dpadd_s_h(offset, wgt, vec8);
+ tmp9 = __msa_dpadd_s_h(offset, wgt, vec9);
+ tmp10 = __msa_dpadd_s_h(offset, wgt, vec10);
+ tmp11 = __msa_dpadd_s_h(offset, wgt, vec11);
+ tmp12 = __msa_dpadd_s_h(offset, wgt, vec12);
+ tmp13 = __msa_dpadd_s_h(offset, wgt, vec13);
+ tmp14 = __msa_dpadd_s_h(offset, wgt, vec14);
+ tmp15 = __msa_dpadd_s_h(offset, wgt, vec15);
+ SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
+ SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
+ SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
+ SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
+ CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
+ CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7);
+ CLIP_SH4_0_255(tmp8, tmp9, tmp10, tmp11);
+ CLIP_SH4_0_255(tmp12, tmp13, tmp14, tmp15);
+ PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
+ dst2, dst3);
+ PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
+ dst5, dst6, dst7);
+ ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
+ }
}
void ff_biweight_h264_pixels8_8_msa(uint8_t *dst, uint8_t *src,
@@ -2441,8 +2570,16 @@ void ff_biweight_h264_pixels8_8_msa(uint8_t *dst, uint8_t *src,
int log2_denom, int weight_dst,
int weight_src, int offset) {
- avc_biwgt_8width_msa(src, stride, dst, stride, height, log2_denom,
- weight_src, weight_dst, offset);
+ if (4 == height) {
+ avc_biwgt_8x4_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
+ offset);
+ } else if (8 == height) {
+ avc_biwgt_8x8_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
+ offset);
+ } else {
+ avc_biwgt_8x16_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
+ offset);
+ }
}
void ff_biweight_h264_pixels4_8_msa(uint8_t *dst, uint8_t *src,
@@ -2450,6 +2587,14 @@ void ff_biweight_h264_pixels4_8_msa(uint8_t *dst, uint8_t *src,
int log2_denom, int weight_dst,
int weight_src, int offset) {
- avc_biwgt_4width_msa(src, stride, dst, stride, height, log2_denom,
- weight_src, weight_dst, offset);
+ if (2 == height) {
+ avc_biwgt_4x2_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
+ offset);
+ } else if (4 == height) {
+ avc_biwgt_4x4_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
+ offset);
+ } else {
+ avc_biwgt_4x8_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
+ offset);
+ }
}
diff --git a/libavutil/mips/generic_macros_msa.h b/libavutil/mips/generic_macros_msa.h
index 7de97dd..c892529 100644
--- a/libavutil/mips/generic_macros_msa.h
+++ b/libavutil/mips/generic_macros_msa.h
@@ -1252,6 +1252,7 @@
}
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
+#define INSERT_W4_SH(...) INSERT_W4(v8i16, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)
/* Description : Insert specified double word elements from input vectors to 1
@@ -1267,6 +1268,7 @@
}
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
+#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__)
#define INSERT_D2_SD(...) INSERT_D2(v2i64, __VA_ARGS__)
/* Description : Interleave even byte elements from vectors
@@ -1444,6 +1446,7 @@
out2 = (RTYPE) __msa_ilvr_b((v16i8) in4, (v16i8) in5); \
}
#define ILVR_B3_UB(...) ILVR_B3(v16u8, __VA_ARGS__)
+#define ILVR_B3_SB(...) ILVR_B3(v16i8, __VA_ARGS__)
#define ILVR_B3_UH(...) ILVR_B3(v8u16, __VA_ARGS__)
#define ILVR_B3_SH(...) ILVR_B3(v8i16, __VA_ARGS__)
@@ -1974,6 +1977,7 @@
XORI_B4_128(RTYPE, in4, in5, in6, in7); \
}
#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
+#define XORI_B8_128_UB(...) XORI_B8_128(v16u8, __VA_ARGS__)
/* Description : Addition of signed halfword elements and signed saturation
Arguments : Inputs - in0, in1, in2, in3
--
1.7.9.5
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel at ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
More information about the ffmpeg-devel
mailing list