[FFmpeg-devel] [PATCH] swscale/aarch64/output.S: refactor ff_yuv2plane1_8_neon
Krzysztof Pyrkosz
ffmpeg at szaka.eu
Fri Jan 31 23:20:03 EET 2025
The benchmarks (before vs. after) were gathered using:
./tests/checkasm/checkasm --test=sw_scale --bench --runs=6 | grep yuv2yuv1
A78 before:
yuv2yuv1_0_512_accurate_c: 2039.5 ( 1.00x)
yuv2yuv1_0_512_accurate_neon: 385.5 ( 5.29x)
yuv2yuv1_0_512_approximate_c: 2110.5 ( 1.00x)
yuv2yuv1_0_512_approximate_neon: 385.5 ( 5.47x)
yuv2yuv1_3_512_accurate_c: 2061.2 ( 1.00x)
yuv2yuv1_3_512_accurate_neon: 381.2 ( 5.41x)
yuv2yuv1_3_512_approximate_c: 2099.2 ( 1.00x)
yuv2yuv1_3_512_approximate_neon: 381.2 ( 5.51x)
yuv2yuv1_8_512_accurate_c: 2054.2 ( 1.00x)
yuv2yuv1_8_512_accurate_neon: 385.5 ( 5.33x)
yuv2yuv1_8_512_approximate_c: 2112.2 ( 1.00x)
yuv2yuv1_8_512_approximate_neon: 385.5 ( 5.48x)
yuv2yuv1_11_512_accurate_c: 2036.0 ( 1.00x)
yuv2yuv1_11_512_accurate_neon: 381.2 ( 5.34x)
yuv2yuv1_11_512_approximate_c: 2115.0 ( 1.00x)
yuv2yuv1_11_512_approximate_neon: 381.2 ( 5.55x)
yuv2yuv1_16_512_accurate_c: 2066.5 ( 1.00x)
yuv2yuv1_16_512_accurate_neon: 385.5 ( 5.36x)
yuv2yuv1_16_512_approximate_c: 2100.8 ( 1.00x)
yuv2yuv1_16_512_approximate_neon: 385.5 ( 5.45x)
yuv2yuv1_19_512_accurate_c: 2059.8 ( 1.00x)
yuv2yuv1_19_512_accurate_neon: 381.2 ( 5.40x)
yuv2yuv1_19_512_approximate_c: 2102.8 ( 1.00x)
yuv2yuv1_19_512_approximate_neon: 381.2 ( 5.52x)
After:
yuv2yuv1_0_512_accurate_c: 2206.0 ( 1.00x)
yuv2yuv1_0_512_accurate_neon: 139.2 (15.84x)
yuv2yuv1_0_512_approximate_c: 2050.0 ( 1.00x)
yuv2yuv1_0_512_approximate_neon: 139.2 (14.72x)
yuv2yuv1_3_512_accurate_c: 2205.2 ( 1.00x)
yuv2yuv1_3_512_accurate_neon: 138.0 (15.98x)
yuv2yuv1_3_512_approximate_c: 2052.5 ( 1.00x)
yuv2yuv1_3_512_approximate_neon: 138.0 (14.87x)
yuv2yuv1_8_512_accurate_c: 2171.0 ( 1.00x)
yuv2yuv1_8_512_accurate_neon: 139.2 (15.59x)
yuv2yuv1_8_512_approximate_c: 2064.2 ( 1.00x)
yuv2yuv1_8_512_approximate_neon: 139.2 (14.82x)
yuv2yuv1_11_512_accurate_c: 2164.8 ( 1.00x)
yuv2yuv1_11_512_accurate_neon: 138.0 (15.69x)
yuv2yuv1_11_512_approximate_c: 2048.8 ( 1.00x)
yuv2yuv1_11_512_approximate_neon: 138.0 (14.85x)
yuv2yuv1_16_512_accurate_c: 2154.5 ( 1.00x)
yuv2yuv1_16_512_accurate_neon: 139.2 (15.47x)
yuv2yuv1_16_512_approximate_c: 2047.2 ( 1.00x)
yuv2yuv1_16_512_approximate_neon: 139.2 (14.70x)
yuv2yuv1_19_512_accurate_c: 2144.5 ( 1.00x)
yuv2yuv1_19_512_accurate_neon: 138.0 (15.54x)
yuv2yuv1_19_512_approximate_c: 2046.0 ( 1.00x)
yuv2yuv1_19_512_approximate_neon: 138.0 (14.83x)
A72 before:
yuv2yuv1_0_512_accurate_c: 3779.8 ( 1.00x)
yuv2yuv1_0_512_accurate_neon: 527.8 ( 7.16x)
yuv2yuv1_0_512_approximate_c: 4128.2 ( 1.00x)
yuv2yuv1_0_512_approximate_neon: 528.2 ( 7.81x)
yuv2yuv1_3_512_accurate_c: 3836.2 ( 1.00x)
yuv2yuv1_3_512_accurate_neon: 527.0 ( 7.28x)
yuv2yuv1_3_512_approximate_c: 3991.0 ( 1.00x)
yuv2yuv1_3_512_approximate_neon: 526.8 ( 7.58x)
yuv2yuv1_8_512_accurate_c: 3732.8 ( 1.00x)
yuv2yuv1_8_512_accurate_neon: 525.5 ( 7.10x)
yuv2yuv1_8_512_approximate_c: 4060.0 ( 1.00x)
yuv2yuv1_8_512_approximate_neon: 527.0 ( 7.70x)
yuv2yuv1_11_512_accurate_c: 3836.2 ( 1.00x)
yuv2yuv1_11_512_accurate_neon: 530.0 ( 7.24x)
yuv2yuv1_11_512_approximate_c: 4014.0 ( 1.00x)
yuv2yuv1_11_512_approximate_neon: 530.0 ( 7.57x)
yuv2yuv1_16_512_accurate_c: 3726.2 ( 1.00x)
yuv2yuv1_16_512_accurate_neon: 525.5 ( 7.09x)
yuv2yuv1_16_512_approximate_c: 4114.2 ( 1.00x)
yuv2yuv1_16_512_approximate_neon: 526.2 ( 7.82x)
yuv2yuv1_19_512_accurate_c: 3812.2 ( 1.00x)
yuv2yuv1_19_512_accurate_neon: 530.0 ( 7.19x)
yuv2yuv1_19_512_approximate_c: 4012.2 ( 1.00x)
yuv2yuv1_19_512_approximate_neon: 530.0 ( 7.57x)
After:
yuv2yuv1_0_512_accurate_c: 3716.8 ( 1.00x)
yuv2yuv1_0_512_accurate_neon: 215.1 (17.28x)
yuv2yuv1_0_512_approximate_c: 3877.8 ( 1.00x)
yuv2yuv1_0_512_approximate_neon: 222.8 (17.40x)
yuv2yuv1_3_512_accurate_c: 3717.1 ( 1.00x)
yuv2yuv1_3_512_accurate_neon: 217.8 (17.06x)
yuv2yuv1_3_512_approximate_c: 3801.6 ( 1.00x)
yuv2yuv1_3_512_approximate_neon: 220.3 (17.25x)
yuv2yuv1_8_512_accurate_c: 3716.6 ( 1.00x)
yuv2yuv1_8_512_accurate_neon: 213.8 (17.38x)
yuv2yuv1_8_512_approximate_c: 3831.8 ( 1.00x)
yuv2yuv1_8_512_approximate_neon: 218.1 (17.57x)
yuv2yuv1_11_512_accurate_c: 3717.1 ( 1.00x)
yuv2yuv1_11_512_accurate_neon: 219.1 (16.97x)
yuv2yuv1_11_512_approximate_c: 3801.6 ( 1.00x)
yuv2yuv1_11_512_approximate_neon: 216.1 (17.59x)
yuv2yuv1_16_512_accurate_c: 3716.6 ( 1.00x)
yuv2yuv1_16_512_accurate_neon: 213.6 (17.40x)
yuv2yuv1_16_512_approximate_c: 3831.6 ( 1.00x)
yuv2yuv1_16_512_approximate_neon: 215.1 (17.82x)
yuv2yuv1_19_512_accurate_c: 3717.1 ( 1.00x)
yuv2yuv1_19_512_accurate_neon: 223.8 (16.61x)
yuv2yuv1_19_512_approximate_c: 3801.6 ( 1.00x)
yuv2yuv1_19_512_approximate_neon: 219.1 (17.35x)
x13s before:
yuv2yuv1_0_512_accurate_c: 1435.1 ( 1.00x)
yuv2yuv1_0_512_accurate_neon: 221.1 ( 6.49x)
yuv2yuv1_0_512_approximate_c: 1405.4 ( 1.00x)
yuv2yuv1_0_512_approximate_neon: 219.1 ( 6.41x)
yuv2yuv1_3_512_accurate_c: 1418.6 ( 1.00x)
yuv2yuv1_3_512_accurate_neon: 215.9 ( 6.57x)
yuv2yuv1_3_512_approximate_c: 1405.9 ( 1.00x)
yuv2yuv1_3_512_approximate_neon: 224.1 ( 6.27x)
yuv2yuv1_8_512_accurate_c: 1433.9 ( 1.00x)
yuv2yuv1_8_512_accurate_neon: 218.6 ( 6.56x)
yuv2yuv1_8_512_approximate_c: 1412.9 ( 1.00x)
yuv2yuv1_8_512_approximate_neon: 218.9 ( 6.46x)
yuv2yuv1_11_512_accurate_c: 1449.1 ( 1.00x)
yuv2yuv1_11_512_accurate_neon: 217.6 ( 6.66x)
yuv2yuv1_11_512_approximate_c: 1410.9 ( 1.00x)
yuv2yuv1_11_512_approximate_neon: 221.1 ( 6.38x)
yuv2yuv1_16_512_accurate_c: 1402.1 ( 1.00x)
yuv2yuv1_16_512_accurate_neon: 214.6 ( 6.53x)
yuv2yuv1_16_512_approximate_c: 1422.4 ( 1.00x)
yuv2yuv1_16_512_approximate_neon: 222.9 ( 6.38x)
yuv2yuv1_19_512_accurate_c: 1421.6 ( 1.00x)
yuv2yuv1_19_512_accurate_neon: 217.4 ( 6.54x)
yuv2yuv1_19_512_approximate_c: 1421.6 ( 1.00x)
yuv2yuv1_19_512_approximate_neon: 221.4 ( 6.42x)
After:
yuv2yuv1_0_512_accurate_c: 1413.6 ( 1.00x)
yuv2yuv1_0_512_accurate_neon: 80.6 (17.53x)
yuv2yuv1_0_512_approximate_c: 1455.6 ( 1.00x)
yuv2yuv1_0_512_approximate_neon: 80.6 (18.05x)
yuv2yuv1_3_512_accurate_c: 1429.1 ( 1.00x)
yuv2yuv1_3_512_accurate_neon: 77.4 (18.47x)
yuv2yuv1_3_512_approximate_c: 1462.6 ( 1.00x)
yuv2yuv1_3_512_approximate_neon: 80.6 (18.14x)
yuv2yuv1_8_512_accurate_c: 1425.4 ( 1.00x)
yuv2yuv1_8_512_accurate_neon: 77.9 (18.30x)
yuv2yuv1_8_512_approximate_c: 1436.6 ( 1.00x)
yuv2yuv1_8_512_approximate_neon: 80.9 (17.76x)
yuv2yuv1_11_512_accurate_c: 1429.4 ( 1.00x)
yuv2yuv1_11_512_accurate_neon: 76.1 (18.78x)
yuv2yuv1_11_512_approximate_c: 1447.1 ( 1.00x)
yuv2yuv1_11_512_approximate_neon: 78.4 (18.46x)
yuv2yuv1_16_512_accurate_c: 1439.9 ( 1.00x)
yuv2yuv1_16_512_accurate_neon: 77.6 (18.55x)
yuv2yuv1_16_512_approximate_c: 1422.1 ( 1.00x)
yuv2yuv1_16_512_approximate_neon: 78.1 (18.20x)
yuv2yuv1_19_512_accurate_c: 1447.1 ( 1.00x)
yuv2yuv1_19_512_accurate_neon: 78.1 (18.52x)
yuv2yuv1_19_512_approximate_c: 1474.4 ( 1.00x)
yuv2yuv1_19_512_approximate_neon: 78.1 (18.87x)
Krzysztof
---
libswscale/aarch64/output.S | 18 ++++++------------
1 file changed, 6 insertions(+), 12 deletions(-)
diff --git a/libswscale/aarch64/output.S b/libswscale/aarch64/output.S
index 934d62dfd0..190c438870 100644
--- a/libswscale/aarch64/output.S
+++ b/libswscale/aarch64/output.S
@@ -214,21 +214,15 @@ function ff_yuv2plane1_8_neon, export=1
and w4, w4, #7
cbz w4, 1f // check if offsetting present
ext v0.8b, v0.8b, v0.8b, #3 // honor offsetting which can be 0 or 3 only
-1: uxtl v0.8h, v0.8b // extend dither to 32-bit
- uxtl v1.4s, v0.4h
- uxtl2 v2.4s, v0.8h
+1:
+ uxtl v0.8h, v0.8b // extend dither to 16-bit
2:
ld1 {v3.8h}, [x0], #16 // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
- sxtl v4.4s, v3.4h
- sxtl2 v5.4s, v3.8h
- add v4.4s, v4.4s, v1.4s
- add v5.4s, v5.4s, v2.4s
- sqshrun v4.4h, v4.4s, #6
- sqshrun2 v4.8h, v5.4s, #6
-
- uqshrn v3.8b, v4.8h, #1 // clip8(val>>7)
subs w2, w2, #8 // dstW -= 8
- st1 {v3.8b}, [x1], #8 // write to destination
+ shadd v1.8h, v0.8h, v3.8h // v1 = (v0 + v3) >> 1
+ sqshrun v2.8b, v1.8h, #6 // clip_uint8(v1 >> 6)
+
+ st1 {v2.8b}, [x1], #8 // write to destination
b.gt 2b // loop until width consumed
ret
endfunc
--
2.45.2
More information about the ffmpeg-devel
mailing list