[FFmpeg-devel] [PATCH v3 4/7] swscale/x86/range_convert: update sse2 and avx2 range_convert functions to new API
Ramiro Polla
ramiro.polla at gmail.com
Sat Nov 30 17:23:39 EET 2024
chrRangeFromJpeg8_1920_c: 5804.5 ( 1.00x)
chrRangeFromJpeg8_1920_sse2: 1960.2 ( 2.96x) 1955.2 ( 2.97x)
chrRangeFromJpeg8_1920_avx2: 996.1 ( 5.83x) 988.9 ( 5.87x)
chrRangeToJpeg8_1920_c: 9388.6 ( 1.00x)
chrRangeToJpeg8_1920_sse2: 1963.7 ( 4.78x) 1949.9 ( 4.81x)
chrRangeToJpeg8_1920_avx2: 984.0 ( 9.54x) 988.5 ( 9.50x)
lumRangeFromJpeg8_1920_c: 4147.9 ( 1.00x)
lumRangeFromJpeg8_1920_sse2: 1032.0 ( 4.02x) 1040.5 ( 3.99x)
lumRangeFromJpeg8_1920_avx2: 575.2 ( 7.21x) 520.5 ( 7.97x)
lumRangeToJpeg8_1920_c: 5694.1 ( 1.00x)
lumRangeToJpeg8_1920_sse2: 1035.9 ( 5.50x) 1046.0 ( 5.44x)
lumRangeToJpeg8_1920_avx2: 513.5 (11.09x) 540.5 (10.53x)
---
libswscale/x86/range_convert.asm | 86 ++++++++++++++++----------------
libswscale/x86/swscale.c | 17 +++----
2 files changed, 50 insertions(+), 53 deletions(-)
diff --git a/libswscale/x86/range_convert.asm b/libswscale/x86/range_convert.asm
index ffda009c4e..27be2a4b31 100644
--- a/libswscale/x86/range_convert.asm
+++ b/libswscale/x86/range_convert.asm
@@ -20,39 +20,29 @@
%include "libavutil/x86/x86util.asm"
-SECTION_RODATA
-
-chr_to_mult: times 4 dw 4663, 0
-chr_to_offset: times 4 dd -9289992
-%define chr_to_shift 12
-
-chr_from_mult: times 4 dw 1799, 0
-chr_from_offset: times 4 dd 4081085
-%define chr_from_shift 11
-
-lum_to_mult: times 4 dw 19077, 0
-lum_to_offset: times 4 dd -39057361
-%define lum_to_shift 14
-
-lum_from_mult: times 4 dw 14071, 0
-lum_from_offset: times 4 dd 33561947
-%define lum_from_shift 14
-
SECTION .text
;-----------------------------------------------------------------------------
; lumConvertRange
;
-; void ff_lumRangeToJpeg_<opt>(int16_t *dst, int width);
-; void ff_lumRangeFromJpeg_<opt>(int16_t *dst, int width);
+; void ff_lumRangeToJpeg_<opt>(int16_t *dst, int width,
+; uint32_t coeff, int64_t offset);
+; void ff_lumRangeFromJpeg_<opt>(int16_t *dst, int width,
+; uint32_t coeff, int64_t offset);
;
;-----------------------------------------------------------------------------
-%macro LUMCONVERTRANGE 4
-cglobal %1, 2, 2, 5, dst, width
+%macro LUMCONVERTRANGE 1
+cglobal lumRange%1Jpeg, 4, 4, 5, dst, width, coeff, offset
shl widthd, 1
- VBROADCASTI128 m2, [%2]
- VBROADCASTI128 m3, [%3]
+ movd xm2, coeffd
+ VBROADCASTSS m2, xm2
+%if ARCH_X86_64
+ movq xm3, offsetq
+%else
+ movq xm3, offsetm
+%endif
+ VBROADCASTSS m3, xm3
pxor m4, m4
add dstq, widthq
neg widthq
@@ -64,8 +54,8 @@ cglobal %1, 2, 2, 5, dst, width
pmaddwd m1, m2
paddd m0, m3
paddd m1, m3
- psrad m0, %4
- psrad m1, %4
+ psrad m0, 14
+ psrad m1, 14
packssdw m0, m1
movu [dstq+widthq], m0
add widthq, mmsize
@@ -76,16 +66,24 @@ cglobal %1, 2, 2, 5, dst, width
;-----------------------------------------------------------------------------
; chrConvertRange
;
-; void ff_chrRangeToJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width);
-; void ff_chrRangeFromJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width);
+; void ff_chrRangeToJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width,
+; uint32_t coeff, int64_t offset);
+; void ff_chrRangeFromJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width,
+; uint32_t coeff, int64_t offset);
;
;-----------------------------------------------------------------------------
-%macro CHRCONVERTRANGE 4
-cglobal %1, 3, 3, 7, dstU, dstV, width
+%macro CHRCONVERTRANGE 1
+cglobal chrRange%1Jpeg, 5, 5, 7, dstU, dstV, width, coeff, offset
shl widthd, 1
- VBROADCASTI128 m4, [%2]
- VBROADCASTI128 m5, [%3]
+ movd xm4, coeffd
+ VBROADCASTSS m4, xm4
+%if ARCH_X86_64
+ movq xm5, offsetq
+%else
+ movq xm5, offsetm
+%endif
+ VBROADCASTSS m5, xm5
pxor m6, m6
add dstUq, widthq
add dstVq, widthq
@@ -105,10 +103,10 @@ cglobal %1, 3, 3, 7, dstU, dstV, width
paddd m1, m5
paddd m2, m5
paddd m3, m5
- psrad m0, %4
- psrad m1, %4
- psrad m2, %4
- psrad m3, %4
+ psrad m0, 14
+ psrad m1, 14
+ psrad m2, 14
+ psrad m3, 14
packssdw m0, m1
packssdw m2, m3
movu [dstUq+widthq], m0
@@ -119,15 +117,15 @@ cglobal %1, 3, 3, 7, dstU, dstV, width
%endmacro
INIT_XMM sse2
-LUMCONVERTRANGE lumRangeToJpeg, lum_to_mult, lum_to_offset, lum_to_shift
-CHRCONVERTRANGE chrRangeToJpeg, chr_to_mult, chr_to_offset, chr_to_shift
-LUMCONVERTRANGE lumRangeFromJpeg, lum_from_mult, lum_from_offset, lum_from_shift
-CHRCONVERTRANGE chrRangeFromJpeg, chr_from_mult, chr_from_offset, chr_from_shift
+LUMCONVERTRANGE To
+CHRCONVERTRANGE To
+LUMCONVERTRANGE From
+CHRCONVERTRANGE From
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
-LUMCONVERTRANGE lumRangeToJpeg, lum_to_mult, lum_to_offset, lum_to_shift
-CHRCONVERTRANGE chrRangeToJpeg, chr_to_mult, chr_to_offset, chr_to_shift
-LUMCONVERTRANGE lumRangeFromJpeg, lum_from_mult, lum_from_offset, lum_from_shift
-CHRCONVERTRANGE chrRangeFromJpeg, chr_from_mult, chr_from_offset, chr_from_shift
+LUMCONVERTRANGE To
+CHRCONVERTRANGE To
+LUMCONVERTRANGE From
+CHRCONVERTRANGE From
%endif
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 2722c4bdc6..550ad99f3f 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -464,27 +464,26 @@ INPUT_PLANAR_RGB_A_ALL_DECL(avx2);
} while (0)
#define RANGE_CONVERT_FUNCS_DECL(opt) \
-void ff_lumRangeFromJpeg_ ##opt(int16_t *dst, int width); \
-void ff_chrRangeFromJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width); \
-void ff_lumRangeToJpeg_ ##opt(int16_t *dst, int width); \
-void ff_chrRangeToJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width); \
+void ff_lumRangeFromJpeg_ ##opt(int16_t *dst, int width, \
+ uint32_t coeff, int64_t offset); \
+void ff_chrRangeFromJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width, \
+ uint32_t coeff, int64_t offset); \
+void ff_lumRangeToJpeg_ ##opt(int16_t *dst, int width, \
+ uint32_t coeff, int64_t offset); \
+void ff_chrRangeToJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width, \
+ uint32_t coeff, int64_t offset); \
RANGE_CONVERT_FUNCS_DECL(sse2);
RANGE_CONVERT_FUNCS_DECL(avx2);
av_cold void ff_sws_init_range_convert_x86(SwsInternal *c)
{
- /* This code is currently disabled because of changes in the base
- * implementation of these functions. This code should be enabled
- * again once those changes are ported to this architecture. */
-#if 0
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
RANGE_CONVERT_FUNCS(avx2);
} else if (EXTERNAL_SSE2(cpu_flags)) {
RANGE_CONVERT_FUNCS(sse2);
}
-#endif
}
av_cold void ff_sws_init_swscale_x86(SwsInternal *c)
--
2.39.5
More information about the ffmpeg-devel mailing list