[FFmpeg-devel] [PATCH] avcodec/x86/hevc: fix luma 12b overflow
J. Dekker
jdek at itanimul.li
Sun Feb 25 10:27:55 EET 2024
Weak filter can overflow in delta0 calculation before >> 4 in int16.
Signed-off-by: J. Dekker <jdek at itanimul.li>
---
I do not know x86 simd at all, so this is just an attempt to fix
the implementation rather than write extremely performant code.
Suggestions welcome.
libavcodec/x86/hevc_deblock.asm | 47 +++++++++++++++++++++++++++++++++
libavcodec/x86/hevcdsp_init.c | 8 ------
2 files changed, 47 insertions(+), 8 deletions(-)
diff --git a/libavcodec/x86/hevc_deblock.asm b/libavcodec/x86/hevc_deblock.asm
index 85ee4800bb..ce9221ebc7 100644
--- a/libavcodec/x86/hevc_deblock.asm
+++ b/libavcodec/x86/hevc_deblock.asm
@@ -541,6 +541,7 @@ ALIGN 16
add betaq, r13
shr betaq, 3; ((beta + (beta >> 1)) >> 3))
+%if %1 < 12
mova m13, [pw_8]
psubw m12, m4, m3 ; q0 - p0
psllw m10, m12, 3; 8 * (q0 - p0)
@@ -553,7 +554,49 @@ ALIGN 16
paddw m12, m13; + 8
psraw m12, 4; >> 4 , delta0
PABSW m13, m12; abs(delta0)
+%else
+ psubw m12, m4, m3 ; q0 - p0
+ pmovsxwd m13, m12 ; m13 low
+ movhlps m12, m12
+ pmovsxwd m12, m12 ; m12 high
+
+ ; m8 low, m10 high
+ pslld m8, m13, 3; 8 * (q0 - p0)
+ pslld m10, m12, 3
+
+ paddd m8, m13 ; 9 * (q0 - p0)
+ paddd m10, m12
+
+ psubw m12, m5, m2 ; q1 - p1
+ pmovsxwd m13, m12 ; m13 low
+ movhlps m12, m12
+ pmovsxwd m12, m12 ; m12 high
+ psubd m8, m13 ; 9 * (q0 - p0) - ( q1 - p1 )
+ psubd m10, m12
+
+ pslld m13, m13, 1; 2 * ( q1 - p1 )
+ pslld m12, m12, 1
+
+ psubd m8, m13; 9 * (q0 - p0) - 3 * ( q1 - p1 )
+ psubd m10, m12
+
+ mova m13, [pw_8]
+ pmovsxwd m13, m13
+
+ paddd m8, m13 ; + 8
+ paddd m10, m13
+
+ psrad m8, 4; >> 4 , delta0
+ psrad m10, 4
+
+ packssdw m12, m8
+ packssdw m10, m10
+
+ psrldq m12, 8
+ punpcklqdq m12, m10
+ PABSW m13, m12; abs(delta0)
+%endif
psllw m10, m9, 2; 8 * tc
paddw m10, m9; 10 * tc
@@ -746,6 +789,7 @@ cglobal hevc_v_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3
.bypassluma:
RET
+%if cpuflag(avx)
cglobal hevc_v_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
sub pixq, 8
lea pix0q, [3 * strideq]
@@ -757,6 +801,7 @@ cglobal hevc_v_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3
TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q), [pw_pixel_max_12]
.bypassluma:
RET
+%endif
;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
@@ -829,6 +874,7 @@ cglobal hevc_h_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3
.bypassluma:
RET
+%if cpuflag(avx)
cglobal hevc_h_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
lea src3strideq, [3 * strideq]
mov pix0q, pixq
@@ -859,6 +905,7 @@ cglobal hevc_h_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3
movdqu [pixq + 2 * strideq], m6; q2
.bypassluma:
RET
+%endif
%endmacro
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index f5bc342cd5..e3fcb7b591 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -1205,10 +1205,6 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
if (EXTERNAL_SSE2(cpu_flags)) {
c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2;
c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2;
- if (ARCH_X86_64) {
- c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
- c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
- }
SAO_BAND_INIT(12, sse2);
SAO_EDGE_INIT(12, sse2);
@@ -1216,10 +1212,6 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_sse2;
c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_sse2;
}
- if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
- c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
- c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_ssse3;
- }
if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 12, sse4);
EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 12, sse4);
--
2.43.2
More information about the ffmpeg-devel
mailing list