[FFmpeg-devel] [PATCH v2 16/16] swscale/aarch64: add neon {lum, chr}ConvertRange16

Fri Sep 27 15:52:41 EEST 2024

                                   A55              A76
chrRangeFromJpeg16_1920_c:     28840.6           6323.5
chrRangeFromJpeg16_1920_neon:   8436.5 ( 3.42x)  3365.2 ( 1.88x)
chrRangeToJpeg16_1920_c:       23075.1           9195.6
chrRangeToJpeg16_1920_neon:     9393.6 ( 2.46x)  4084.5 ( 2.25x)
lumRangeFromJpeg16_1920_c:     15383.8           4436.8
lumRangeFromJpeg16_1920_neon:   4586.0 ( 3.35x)  1814.0 ( 2.45x)
lumRangeToJpeg16_1920_c:       19225.5           6017.2
lumRangeToJpeg16_1920_neon:     5067.9 ( 3.79x)  2146.4 ( 2.80x)
---
 libswscale/aarch64/range_convert_neon.S | 98 +++++++++++++++++++++++--
 libswscale/aarch64/swscale.c            | 36 ++++++---
 2 files changed, 116 insertions(+), 18 deletions(-)

diff --git a/libswscale/aarch64/range_convert_neon.S b/libswscale/aarch64/range_convert_neon.S
index 1aadd8e04d..f1812301ed 100644
--- a/libswscale/aarch64/range_convert_neon.S
+++ b/libswscale/aarch64/range_convert_neon.S
@@ -20,12 +20,42 @@
 
 #include "libavutil/aarch64/asm.S"
 
-.macro lumConvertRange fromto
-function ff_lumRange\fromto\()Jpeg_neon, export=1
+.macro lumConvertRange fromto, bit_depth
+function ff_lumRange\fromto\()Jpeg\bit_depth\()_neon, export=1
 // x0  int16_t *dst
 // w1  int width
 // w2  int coeff
 // x3  int64_t offset
+.if \bit_depth == 16
+.ifc \fromto, To
+        movi            v25.4s, #1
+        movi            v24.4s, #1<<3, lsl #16
+        sub             v24.4s, v24.4s, v25.4s
+.endif
+        dup             v25.4s, w2
+        dup             v26.2d, x3
+1:
+        ld1             {v0.4s, v1.4s}, [x0]
+        mov             v16.16b, v26.16b
+        mov             v17.16b, v26.16b
+        mov             v18.16b, v26.16b
+        mov             v19.16b, v26.16b
+        smlal           v16.2d, v0.2s, v25.2s
+        smlal2          v17.2d, v0.4s, v25.4s
+        smlal           v18.2d, v1.2s, v25.2s
+        smlal2          v19.2d, v1.4s, v25.4s
+        shrn            v0.2s, v16.2d, 18
+        shrn2           v0.4s, v17.2d, 18
+        shrn            v1.2s, v18.2d, 18
+        shrn2           v1.4s, v19.2d, 18
+        subs            w1, w1, #8
+.ifc \fromto, To
+        smin            v0.4s, v0.4s, v24.4s
+        smin            v1.4s, v1.4s, v24.4s
+.endif
+        st1             {v0.4s, v1.4s}, [x0], #32
+        b.gt            1b
+.else
         dup             v25.4s, w2
         dup             v26.4s, w3
 1:
@@ -46,17 +76,64 @@ function ff_lumRange\fromto\()Jpeg_neon, export=1
         subs            w1, w1, #8
         st1             {v0.8h}, [x0], #16
         b.gt            1b
+.endif
         ret
 endfunc
 .endm
 
-.macro chrConvertRange fromto
-function ff_chrRange\fromto\()Jpeg_neon, export=1
+.macro chrConvertRange fromto, bit_depth
+function ff_chrRange\fromto\()Jpeg\bit_depth\()_neon, export=1
 // x0  int16_t *dstU
 // x1  int16_t *dstV
 // w2  int width
 // w3  int coeff
 // x4  int64_t offset
+.if \bit_depth == 16
+.ifc \fromto, To
+        movi            v25.4s, #1
+        movi            v24.4s, #1<<3, lsl #16
+        sub             v24.4s, v24.4s, v25.4s
+.endif
+        dup             v25.4s, w3
+        dup             v26.2d, x4
+1:
+        ld1             {v0.4s, v1.4s}, [x0]
+        ld1             {v2.4s, v3.4s}, [x1]
+        mov             v16.16b, v26.16b
+        mov             v17.16b, v26.16b
+        mov             v18.16b, v26.16b
+        mov             v19.16b, v26.16b
+        mov             v20.16b, v26.16b
+        mov             v21.16b, v26.16b
+        mov             v22.16b, v26.16b
+        mov             v23.16b, v26.16b
+        smlal           v16.2d, v0.2s, v25.2s
+        smlal2          v17.2d, v0.4s, v25.4s
+        smlal           v18.2d, v1.2s, v25.2s
+        smlal2          v19.2d, v1.4s, v25.4s
+        smlal           v20.2d, v2.2s, v25.2s
+        smlal2          v21.2d, v2.4s, v25.4s
+        smlal           v22.2d, v3.2s, v25.2s
+        smlal2          v23.2d, v3.4s, v25.4s
+        shrn            v0.2s, v16.2d, 18
+        shrn2           v0.4s, v17.2d, 18
+        shrn            v1.2s, v18.2d, 18
+        shrn2           v1.4s, v19.2d, 18
+        shrn            v2.2s, v20.2d, 18
+        shrn2           v2.4s, v21.2d, 18
+        shrn            v3.2s, v22.2d, 18
+        shrn2           v3.4s, v23.2d, 18
+        subs            w2, w2, #8
+.ifc \fromto, To
+        smin            v0.4s, v0.4s, v24.4s
+        smin            v1.4s, v1.4s, v24.4s
+        smin            v2.4s, v2.4s, v24.4s
+        smin            v3.4s, v3.4s, v24.4s
+.endif
+        st1             {v0.4s, v1.4s}, [x0], #32
+        st1             {v2.4s, v3.4s}, [x1], #32
+        b.gt            1b
+.else
         dup             v25.4s, w3
         dup             v26.4s, w4
 1:
@@ -89,11 +166,16 @@ function ff_chrRange\fromto\()Jpeg_neon, export=1
         st1             {v0.8h}, [x0], #16
         st1             {v1.8h}, [x1], #16
         b.gt            1b
+.endif
         ret
 endfunc
 .endm
 
-lumConvertRange To
-chrConvertRange To
-lumConvertRange From
-chrConvertRange From
+lumConvertRange To,    8
+lumConvertRange To,   16
+chrConvertRange To,    8
+chrConvertRange To,   16
+lumConvertRange From,  8
+lumConvertRange From, 16
+chrConvertRange From,  8
+chrConvertRange From, 16
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index 98f07ecfe5..55d8ffc281 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -218,14 +218,22 @@ NEON_INPUT(bgra32);
 NEON_INPUT(rgb24);
 NEON_INPUT(rgba32);
 
-void ff_lumRangeFromJpeg_neon(int16_t *dst, int width,
+void ff_lumRangeFromJpeg8_neon(int16_t *dst, int width,
+                               int coeff, int64_t offset);
+void ff_chrRangeFromJpeg8_neon(int16_t *dstU, int16_t *dstV, int width,
+                               int coeff, int64_t offset);
+void ff_lumRangeToJpeg8_neon(int16_t *dst, int width,
+                             int coeff, int64_t offset);
+void ff_chrRangeToJpeg8_neon(int16_t *dstU, int16_t *dstV, int width,
+                             int coeff, int64_t offset);
+void ff_lumRangeFromJpeg16_neon(int16_t *dst, int width,
+                                int coeff, int64_t offset);
+void ff_chrRangeFromJpeg16_neon(int16_t *dstU, int16_t *dstV, int width,
+                                int coeff, int64_t offset);
+void ff_lumRangeToJpeg16_neon(int16_t *dst, int width,
                               int coeff, int64_t offset);
-void ff_chrRangeFromJpeg_neon(int16_t *dstU, int16_t *dstV, int width,
+void ff_chrRangeToJpeg16_neon(int16_t *dstU, int16_t *dstV, int width,
                               int coeff, int64_t offset);
-void ff_lumRangeToJpeg_neon(int16_t *dst, int width,
-                            int coeff, int64_t offset);
-void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width,
-                            int coeff, int64_t offset);
 
 av_cold void ff_sws_init_range_convert_aarch64(SwsContext *c)
 {
@@ -234,11 +242,19 @@ av_cold void ff_sws_init_range_convert_aarch64(SwsContext *c)
     if (have_neon(cpu_flags)) {
         if (c->dstBpc <= 14) {
             if (c->srcRange) {
-                c->lumConvertRange = ff_lumRangeFromJpeg_neon;
-                c->chrConvertRange = ff_chrRangeFromJpeg_neon;
+                c->lumConvertRange = ff_lumRangeFromJpeg8_neon;
+                c->chrConvertRange = ff_chrRangeFromJpeg8_neon;
             } else {
-                c->lumConvertRange = ff_lumRangeToJpeg_neon;
-                c->chrConvertRange = ff_chrRangeToJpeg_neon;
+                c->lumConvertRange = ff_lumRangeToJpeg8_neon;
+                c->chrConvertRange = ff_chrRangeToJpeg8_neon;
+            }
+        } else {
+            if (c->srcRange) {
+                c->lumConvertRange = ff_lumRangeFromJpeg16_neon;
+                c->chrConvertRange = ff_chrRangeFromJpeg16_neon;
+            } else {
+                c->lumConvertRange = ff_lumRangeToJpeg16_neon;
+                c->chrConvertRange = ff_chrRangeToJpeg16_neon;
             }
         }
     }
-- 
2.30.2