[FFmpeg-devel] [PATCH 2/2] swscale/x86/input: add AVX2 optimized uyvytoyuv422
James Almer
jamrial at gmail.com
Wed Jun 5 23:28:53 EEST 2024
uyvytoyuv422_c: 23991.8
uyvytoyuv422_sse2: 2817.8
uyvytoyuv422_avx: 2819.3
uyvytoyuv422_avx2: 1972.3
Signed-off-by: James Almer <jamrial at gmail.com>
---
libswscale/x86/rgb2rgb.c | 6 ++++++
libswscale/x86/rgb_2_rgb.asm | 32 ++++++++++++++++++++++++--------
2 files changed, 30 insertions(+), 8 deletions(-)
diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index b325e5dbd5..21ccfafe51 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -136,6 +136,9 @@ void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride);
+void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ const uint8_t *src, int width, int height,
+ int lumStride, int chromStride, int srcStride);
#endif
av_cold void rgb2rgb_init_x86(void)
@@ -177,5 +180,8 @@ av_cold void rgb2rgb_init_x86(void)
if (EXTERNAL_AVX(cpu_flags)) {
uyvytoyuv422 = ff_uyvytoyuv422_avx;
}
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ uyvytoyuv422 = ff_uyvytoyuv422_avx2;
+ }
#endif
}
diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
index 76ca1eec03..0bf1278718 100644
--- a/libswscale/x86/rgb_2_rgb.asm
+++ b/libswscale/x86/rgb_2_rgb.asm
@@ -34,13 +34,16 @@ pb_shuffle3210: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
SECTION .text
-%macro RSHIFT_COPY 3
+%macro RSHIFT_COPY 5
-; %1 dst ; %2 src ; %3 shift
+; %1 dst ; %2 src ; %3 src2 (ymm only) ; %4 shift ; %5 vperm2i128 imm (ymm only)
-%if cpuflag(avx)
- psrldq %1, %2, %3
+%if mmsize == 32
+ vperm2i128 %1, %2, %3, %5
+ RSHIFT %1, %4
+%elif cpuflag(avx)
+ psrldq %1, %2, %4
%else
mova %1, %2
- RSHIFT %1, %3
+ RSHIFT %1, %4
%endif
%endmacro
@@ -233,26 +236,37 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
jge .end_line
.loop_simd:
+%if mmsize == 32
+ movu xm2, [srcq + wtwoq ]
+ movu xm3, [srcq + wtwoq + 16 ]
+ movu xm4, [srcq + wtwoq + 16 * 2]
+ movu xm5, [srcq + wtwoq + 16 * 3]
+ vinserti128 m2, m2, [srcq + wtwoq + 16 * 4], 1
+ vinserti128 m3, m3, [srcq + wtwoq + 16 * 5], 1
+ vinserti128 m4, m4, [srcq + wtwoq + 16 * 6], 1
+ vinserti128 m5, m5, [srcq + wtwoq + 16 * 7], 1
+%else
movu m2, [srcq + wtwoq ]
movu m3, [srcq + wtwoq + mmsize ]
movu m4, [srcq + wtwoq + mmsize * 2]
movu m5, [srcq + wtwoq + mmsize * 3]
+%endif
; extract y part 1
- RSHIFT_COPY m6, m2, 1 ; UYVY UYVY -> YVYU YVY...
+ RSHIFT_COPY m6, m2, m4, 1, 0x20 ; UYVY UYVY -> YVYU YVY...
pand m6, m1; YxYx YxYx...
- RSHIFT_COPY m7, m3, 1 ; UYVY UYVY -> YVYU YVY...
+ RSHIFT_COPY m7, m3, m5, 1, 0x20 ; UYVY UYVY -> YVYU YVY...
pand m7, m1 ; YxYx YxYx...
packuswb m6, m7 ; YYYY YYYY...
movu [ydstq + wq], m6
; extract y part 2
- RSHIFT_COPY m6, m4, 1 ; UYVY UYVY -> YVYU YVY...
+ RSHIFT_COPY m6, m4, m2, 1, 0x13 ; UYVY UYVY -> YVYU YVY...
pand m6, m1; YxYx YxYx...
- RSHIFT_COPY m7, m5, 1 ; UYVY UYVY -> YVYU YVY...
+ RSHIFT_COPY m7, m5, m3, 1, 0x13 ; UYVY UYVY -> YVYU YVY...
pand m7, m1 ; YxYx YxYx...
packuswb m6, m7 ; YYYY YYYY...
@@ -309,4 +323,6 @@ UYVY_TO_YUV422
INIT_XMM avx
UYVY_TO_YUV422
+INIT_YMM avx2
+UYVY_TO_YUV422
%endif
--
2.45.1
More information about the ffmpeg-devel
mailing list