[FFmpeg-devel] [PATCH 2/2] swscale/aarch64: Add bgra/rgba to yuv
Zhao Zhili
quinkblack at foxmail.com
Wed Jun 19 12:24:28 EEST 2024
> On Jun 19, 2024, at 15:07, Rémi Denis-Courmont <remi at remlab.net> wrote:
>
>
>
> Le 15 juin 2024 11:57:18 GMT+02:00, Zhao Zhili <quinkblack at foxmail.com> a écrit :
>> From: Zhao Zhili <zhilizhao at tencent.com>
>>
>> Tested on Apple M1 with kperf
>>
>> bgra_to_uv_8_c: 13.4
>> bgra_to_uv_8_neon: 37.4
>> bgra_to_uv_128_c: 155.9
>> bgra_to_uv_128_neon: 91.7
>> bgra_to_uv_1080_c: 1173.2
>> bgra_to_uv_1080_neon: 822.7
>> bgra_to_uv_1920_c: 2078.2
>> bgra_to_uv_1920_neon: 1437.7
>> bgra_to_uv_half_8_c: 17.9
>> bgra_to_uv_half_8_neon: 37.4
>> bgra_to_uv_half_128_c: 103.9
>> bgra_to_uv_half_128_neon: 73.9
>> bgra_to_uv_half_1080_c: 850.2
>> bgra_to_uv_half_1080_neon: 484.2
>> bgra_to_uv_half_1920_c: 1479.2
>> bgra_to_uv_half_1920_neon: 824.2
>> bgra_to_y_8_c: 8.2
>> bgra_to_y_8_neon: 18.2
>> bgra_to_y_128_c: 101.4
>> bgra_to_y_128_neon: 74.9
>> bgra_to_y_1080_c: 739.4
>> bgra_to_y_1080_neon: 613.4
>> bgra_to_y_1920_c: 1298.7
>> bgra_to_y_1920_neon: 918.7
>> ---
>> libswscale/aarch64/input.S | 81 +++++++++++++++++++++++++++++++-----
>> libswscale/aarch64/swscale.c | 16 +++++++
>> 2 files changed, 86 insertions(+), 11 deletions(-)
>>
>> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
>> index 2b956fe5c2..37f1158504 100644
>> --- a/libswscale/aarch64/input.S
>> +++ b/libswscale/aarch64/input.S
>> @@ -20,8 +20,12 @@
>>
>> #include "libavutil/aarch64/asm.S"
>>
>> -.macro rgb_to_yuv_load_rgb src
>> +.macro rgb_to_yuv_load_rgb src, element=3
>> + .if \element == 3
>> ld3 { v16.16b, v17.16b, v18.16b }, [\src]
>> + .else
>> + ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [\src]
>> + .endif
>> uxtl v19.8h, v16.8b // v19: r
>> uxtl v20.8h, v17.8b // v20: g
>> uxtl v21.8h, v18.8b // v21: b
>> @@ -43,7 +47,7 @@
>> sqshrn2 \dst\().8h, \dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift
>> .endm
>>
>> -.macro rgbToY bgr
>> +.macro rgbToY bgr, element=3
>
> AFAICT, you don't need a macro parameter for component order. Just swap the red and blue coefficients in the prologue and then run the bit-exact same loops for bgr/rgb, rgba/bgra and argb/abgr. This adds one branch in the prologue, but that's mostly negligible compared to the loop.
I’m not sure where to add the branch. Could you elaborate? Do you mean loading the coefficients first, like the following:
function ff_bgr24ToUV_half_neon, export=1
ldr w12, [x6, #12]
ldr w11, [x6, #16]
ldr w10, [x6, #20]
ldr w15, [x6, #24]
ldr w14, [x6, #28]
ldr w13, [x6, #32]
rgbToUV_half
endfunc
>
>> cmp w4, #0 // check width > 0
>> .if \bgr
>> ldr w12, [x5] // w12: ry
>> @@ -67,11 +71,15 @@
>> dup v2.8h, w12
>> b.lt 2f
>> 1:
>> - rgb_to_yuv_load_rgb x1
>> + rgb_to_yuv_load_rgb x1, \element
>> rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>> rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>> sub w4, w4, #16 // width -= 16
>> + .if \element == 3
>> add x1, x1, #48 // src += 48
>> + .else
>> + add x1, x1, #64
>> + .endif
>> cmp w4, #16 // width >= 16 ?
>> stp q16, q17, [x0], #32 // store to dst
>> b.ge 1b
>> @@ -86,7 +94,7 @@
>> smaddl x13, w15, w12, x13 // x13 += by * b
>> asr w13, w13, #9 // x13 >>= 9
>> sub w4, w4, #1 // width--
>> - add x1, x1, #3 // src += 3
>> + add x1, x1, \element
>> strh w13, [x0], #2 // store to dst
>> cbnz w4, 2b
>> 3:
>> @@ -101,6 +109,14 @@ function ff_bgr24ToY_neon, export=1
>> rgbToY bgr=1
>> endfunc
>>
>> +function ff_rgba32ToY_neon, export=1
>> + rgbToY bgr=0, element=4
>> +endfunc
>> +
>> +function ff_bgra32ToY_neon, export=1
>> + rgbToY bgr=1, element=4
>> +endfunc
>> +
>> .macro rgb_load_uv_coeff half, bgr
>> .if \bgr
>> ldr w12, [x6, #12]
>> @@ -130,7 +146,7 @@ endfunc
>> dup v6.4s, w9
>> .endm
>>
>> -.macro rgbToUV_half bgr
>> +.macro rgbToUV_half bgr, element=3
>> cmp w5, #0 // check width > 0
>> b.le 3f
>>
>> @@ -139,7 +155,11 @@ endfunc
>> b.lt 2f
>> // The following comments assume RGB order. The logic for RGB and BGR is the same.
>> 1:
>> + .if \element == 3
>> ld3 { v16.16b, v17.16b, v18.16b }, [x3]
>> + .else
>> + ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [x3]
>> + .endif
>> uaddlp v19.8h, v16.16b // v19: r
>> uaddlp v20.8h, v17.16b // v20: g
>> uaddlp v21.8h, v18.16b // v21: b
>> @@ -147,7 +167,11 @@ endfunc
>> rgb_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
>> rgb_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
>> sub w5, w5, #8 // width -= 8
>> - add x3, x3, #48 // src += 48
>> + .if \element == 3
>> + add x3, x3, #48
>> + .else
>> + add x3, x3, #64
>> + .endif
>> cmp w5, #8 // width >= 8 ?
>> str q16, [x0], #16 // store dst_u
>> str q17, [x1], #16 // store dst_v
>> @@ -155,9 +179,10 @@ endfunc
>> cbz w5, 3f
>> 2:
>> ldrb w2, [x3] // w2: r1
>> - ldrb w4, [x3, #3] // w4: r2
>> + ldrb w4, [x3, \element] // w4: r2
>> add w2, w2, w4 // w2 = r1 + r2
>>
>> + .if \element == 3
>> ldrb w4, [x3, #1] // w4: g1
>> ldrb w7, [x3, #4] // w7: g2
>> add w4, w4, w7 // w4 = g1 + g2
>> @@ -165,6 +190,15 @@ endfunc
>> ldrb w7, [x3, #2] // w7: b1
>> ldrb w8, [x3, #5] // w8: b2
>> add w7, w7, w8 // w7 = b1 + b2
>> + .else
>> + ldrb w4, [x3, #1] // w4: g1
>> + ldrb w7, [x3, #5] // w7: g2
>> + add w4, w4, w7 // w4 = g1 + g2
>> +
>> + ldrb w7, [x3, #2] // w7: b1
>> + ldrb w8, [x3, #6] // w8: b2
>> + add w7, w7, w8 // w7 = b1 + b2
>> + .endif
>>
>> smaddl x8, w2, w10, x9 // dst_u = ru * r + const_offset
>> smaddl x8, w4, w11, x8 // dst_u += gu * g
>> @@ -177,7 +211,12 @@ endfunc
>> smaddl x8, w7, w15, x8 // dst_v += bv * b
>> asr x8, x8, #10 // dst_v >>= 10
>> sub w5, w5, #1
>> - add x3, x3, #6 // src += 6
>> + ldrb w4, [x3, #1] // w4: g1
>> + .if \element == 3
>> + add x3, x3, #6
>> + .else
>> + add x3, x3, #8
>> + .endif
>> strh w8, [x1], #2 // store dst_v
>> cbnz w5, 2b
>> 3:
>> @@ -192,7 +231,15 @@ function ff_bgr24ToUV_half_neon, export=1
>> rgbToUV_half bgr=1
>> endfunc
>>
>> -.macro rgbToUV bgr
>> +function ff_rgba32ToUV_half_neon, export=1
>> + rgbToUV_half bgr=0, element=4
>> +endfunc
>> +
>> +function ff_bgra32ToUV_half_neon, export=1
>> + rgbToUV_half bgr=1, element=4
>> +endfunc
>> +
>> +.macro rgbToUV bgr, element=3
>> cmp w5, #0 // check width > 0
>> b.le 3f
>>
>> @@ -201,13 +248,17 @@ endfunc
>> b.lt 2f
>> // The following comments assume RGB order. The logic for RGB and BGR is the same.
>> 1:
>> - rgb_to_yuv_load_rgb x3
>> + rgb_to_yuv_load_rgb x3, \element
>> rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>> rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>> rgb_to_yuv_product v19, v20, v21, v25, v26, v18, v3, v4, v5, #9
>> rgb_to_yuv_product v22, v23, v24, v27, v28, v19, v3, v4, v5, #9
>> sub w5, w5, #16
>> + .if \element == 3
>> add x3, x3, #48 // src += 48
>> + .else
>> + add x3, x3, #64
>> + .endif
>> cmp w5, #16
>> stp q16, q17, [x0], #32 // store to dst_u
>> stp q18, q19, [x1], #32 // store to dst_v
>> @@ -229,7 +280,7 @@ endfunc
>> smaddl x8, w4, w15, x8 // x8 += bv * b
>> asr w8, w8, #9 // x8 >>= 9
>> sub w5, w5, #1 // width--
>> - add x3, x3, #3 // src += 3
>> + add x3, x3, \element
>> strh w8, [x1], #2 // store to dst_v
>> cbnz w5, 2b
>> 3:
>> @@ -243,3 +294,11 @@ endfunc
>> function ff_bgr24ToUV_neon, export=1
>> rgbToUV bgr=1
>> endfunc
>> +
>> +function ff_rgba32ToUV_neon, export=1
>> + rgbToUV bgr=0, element=4
>> +endfunc
>> +
>> +function ff_bgra32ToUV_neon, export=1
>> + rgbToUV bgr=1, element=4
>> +endfunc
>> diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
>> index ce70dbedcc..8fe9fb11ac 100644
>> --- a/libswscale/aarch64/swscale.c
>> +++ b/libswscale/aarch64/swscale.c
>> @@ -212,7 +212,9 @@ void ff_##name##ToUV_half_neon(uint8_t *, uint8_t *, const uint8_t *, \
>> uint32_t *coeffs, void *)
>>
>> NEON_INPUT(bgr24);
>> +NEON_INPUT(bgra32);
>> NEON_INPUT(rgb24);
>> +NEON_INPUT(rgba32);
>>
>> av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
>> {
>> @@ -233,6 +235,13 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
>> else
>> c->chrToYV12 = ff_bgr24ToUV_neon;
>> break;
>> + case AV_PIX_FMT_BGRA:
>> + c->lumToYV12 = ff_bgra32ToY_neon;
>> + if (c->chrSrcHSubSample)
>> + c->chrToYV12 = ff_bgra32ToUV_half_neon;
>> + else
>> + c->chrToYV12 = ff_bgra32ToUV_neon;
>> + break;
>> case AV_PIX_FMT_RGB24:
>> c->lumToYV12 = ff_rgb24ToY_neon;
>> if (c->chrSrcHSubSample)
>> @@ -240,6 +249,13 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
>> else
>> c->chrToYV12 = ff_rgb24ToUV_neon;
>> break;
>> + case AV_PIX_FMT_RGBA:
>> + c->lumToYV12 = ff_rgba32ToY_neon;
>> + if (c->chrSrcHSubSample)
>> + c->chrToYV12 = ff_rgba32ToUV_half_neon;
>> + else
>> + c->chrToYV12 = ff_rgba32ToUV_neon;
>> + break;
>> default:
>> break;
>> }
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request at ffmpeg.org with subject "unsubscribe".
More information about the ffmpeg-devel mailing list