[FFmpeg-devel] [PATCH 3/6] avcodec/vc1: Arm 64-bit NEON inverse transform fast paths

Ben Avison bavison at riscosopen.org
Thu Mar 17 20:58:16 EET 2022


Signed-off-by: Ben Avison <bavison at riscosopen.org>
---
 libavcodec/aarch64/vc1dsp_init_aarch64.c |  19 +
 libavcodec/aarch64/vc1dsp_neon.S         | 678 +++++++++++++++++++++++
 2 files changed, 697 insertions(+)

diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c b/libavcodec/aarch64/vc1dsp_init_aarch64.c
index edfb296b75..b672b2aa99 100644
--- a/libavcodec/aarch64/vc1dsp_init_aarch64.c
+++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c
@@ -25,6 +25,16 @@
 
 #include "config.h"
 
+void ff_vc1_inv_trans_8x8_neon(int16_t *block);
+void ff_vc1_inv_trans_8x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_4x8_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_4x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+
+void ff_vc1_inv_trans_8x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+
 void ff_vc1_v_loop_filter4_neon(uint8_t *src, int stride, int pq);
 void ff_vc1_h_loop_filter4_neon(uint8_t *src, int stride, int pq);
 void ff_vc1_v_loop_filter8_neon(uint8_t *src, int stride, int pq);
@@ -46,6 +56,15 @@ av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
     int cpu_flags = av_get_cpu_flags();
 
     if (have_neon(cpu_flags)) {
+        dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_neon;
+        dsp->vc1_inv_trans_8x4 = ff_vc1_inv_trans_8x4_neon;
+        dsp->vc1_inv_trans_4x8 = ff_vc1_inv_trans_4x8_neon;
+        dsp->vc1_inv_trans_4x4 = ff_vc1_inv_trans_4x4_neon;
+        dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_neon;
+        dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon;
+        dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_neon;
+        dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon;
+
         dsp->vc1_v_loop_filter4  = ff_vc1_v_loop_filter4_neon;
         dsp->vc1_h_loop_filter4  = ff_vc1_h_loop_filter4_neon;
         dsp->vc1_v_loop_filter8  = ff_vc1_v_loop_filter8_neon;
diff --git a/libavcodec/aarch64/vc1dsp_neon.S b/libavcodec/aarch64/vc1dsp_neon.S
index fe8963545a..c3ca3eae1e 100644
--- a/libavcodec/aarch64/vc1dsp_neon.S
+++ b/libavcodec/aarch64/vc1dsp_neon.S
@@ -22,7 +22,685 @@
 
 #include "libavutil/aarch64/asm.S"
 
+// VC-1 8x8 inverse transform (both 8-point passes, performed in place)
+// On entry:
+//   x0 -> array of 16-bit inverse transform coefficients, in column-major order
+// On exit:
+//   array at x0 updated to hold transformed block; also now held in row-major order
+function ff_vc1_inv_trans_8x8_neon, export=1
+        ld1     {v1.16b, v2.16b}, [x0], #32     // v1 = src[0..7],   v2 = src[8..15]
+        ld1     {v3.16b, v4.16b}, [x0], #32     // v3 = src[16..23], v4 = src[24..31]
+        ld1     {v5.16b, v6.16b}, [x0], #32     // v5 = src[32..39], v6 = src[40..47]
+        shl     v1.8h, v1.8h, #2        //         8/2 * src[0]
+        sub     x1, x0, #3*32           // x1 -> start of block again (x0 was post-incremented 3 times); used for the stores
+        ld1     {v16.16b, v17.16b}, [x0]        // v16 = src[48..55], v17 = src[56..63]
+        shl     v7.8h, v2.8h, #4        //          16 * src[8]
+        shl     v18.8h, v2.8h, #2       //           4 * src[8]
+        shl     v19.8h, v4.8h, #4       //                        16 * src[24]
+        ldr     d0, .Lcoeffs_it8        // multiplier table (defined elsewhere); used below as h[0] = 6/2, h[1] = 9, h[2] = 15
+        shl     v5.8h, v5.8h, #2        //                                      8/2 * src[32]
+        shl     v20.8h, v6.8h, #4       //                                       16 * src[40]
+        shl     v21.8h, v6.8h, #2       //                                        4 * src[40]
+        shl     v22.8h, v17.8h, #4      //                                                      16 * src[56]
+        ssra    v20.8h, v19.8h, #2      //                         4 * src[24] + 16 * src[40]
+        mul     v23.8h, v3.8h, v0.h[0]  //                       6/2 * src[16]
+        sub     v19.8h, v19.8h, v21.8h  //                        16 * src[24] -  4 * src[40]
+        ssra    v7.8h, v22.8h, #2       //          16 * src[8]                               +  4 * src[56]
+        sub     v18.8h, v22.8h, v18.8h  //        -  4 * src[8]                               + 16 * src[56]
+        shl     v3.8h, v3.8h, #3        //                      16/2 * src[16]
+        mls     v20.8h, v2.8h, v0.h[2]  //        - 15 * src[8] +  4 * src[24] + 16 * src[40]
+        ssra    v1.8h, v1.8h, #1        //        12/2 * src[0]
+        ssra    v5.8h, v5.8h, #1        //                                     12/2 * src[32]
+        mla     v7.8h, v4.8h, v0.h[2]   //          16 * src[8] + 15 * src[24]                +  4 * src[56]
+        shl     v21.8h, v16.8h, #3      //                                                    16/2 * src[48]
+        mls     v19.8h, v2.8h, v0.h[1]  //        -  9 * src[8] + 16 * src[24] -  4 * src[40]
+        sub     v2.8h, v23.8h, v21.8h   // t4/2 =                6/2 * src[16]              - 16/2 * src[48]
+        mla     v18.8h, v4.8h, v0.h[1]  //        -  4 * src[8] +  9 * src[24]                + 16 * src[56]
+        add     v4.8h, v1.8h, v5.8h     // t1/2 = 12/2 * src[0]              + 12/2 * src[32]
+        sub     v1.8h, v1.8h, v5.8h     // t2/2 = 12/2 * src[0]              - 12/2 * src[32]
+        mla     v3.8h, v16.8h, v0.h[0]  // t3/2 =               16/2 * src[16]              +  6/2 * src[48]
+        mla     v7.8h, v6.8h, v0.h[1]   //  t1  =   16 * src[8] + 15 * src[24] +  9 * src[40] +  4 * src[56]
+        add     v5.8h, v1.8h, v2.8h     // t6/2 = t2/2 + t4/2
+        sub     v16.8h, v1.8h, v2.8h    // t7/2 = t2/2 - t4/2
+        mla     v20.8h, v17.8h, v0.h[1] // -t2  = - 15 * src[8] +  4 * src[24] + 16 * src[40] +  9 * src[56]
+        add     v21.8h, v1.8h, v2.8h    // t6/2 = t2/2 + t4/2 (second copy, for the t6 - t2 output)
+        add     v22.8h, v4.8h, v3.8h    // t5/2 = t1/2 + t3/2
+        mls     v19.8h, v17.8h, v0.h[2] // -t3  = -  9 * src[8] + 16 * src[24] -  4 * src[40] - 15 * src[56]
+        sub     v17.8h, v4.8h, v3.8h    // t8/2 = t1/2 - t3/2
+        add     v23.8h, v4.8h, v3.8h    // t5/2 = t1/2 + t3/2 (second copy, for the t5 - t1 output)
+        mls     v18.8h, v6.8h, v0.h[2]  // -t4  = -  4 * src[8] +  9 * src[24] - 15 * src[40] + 16 * src[56]
+        sub     v1.8h, v1.8h, v2.8h     // t7/2 = t2/2 - t4/2 (second copy, for the t7 - t3 output)
+        sub     v2.8h, v4.8h, v3.8h     // t8/2 = t1/2 - t3/2 (second copy, for the t8 - t4 output)
+        neg     v3.8h, v7.8h            // -t1
+        neg     v4.8h, v20.8h           // +t2
+        neg     v6.8h, v19.8h           // +t3
+        ssra    v22.8h, v7.8h, #1       // (t5 + t1) >> 1
+        ssra    v1.8h, v19.8h, #1       // (t7 - t3) >> 1
+        neg     v7.8h, v18.8h           // +t4
+        ssra    v5.8h, v4.8h, #1        // (t6 + t2) >> 1
+        ssra    v16.8h, v6.8h, #1       // (t7 + t3) >> 1
+        ssra    v2.8h, v18.8h, #1       // (t8 - t4) >> 1
+        ssra    v17.8h, v7.8h, #1       // (t8 + t4) >> 1
+        ssra    v21.8h, v20.8h, #1      // (t6 - t2) >> 1
+        ssra    v23.8h, v3.8h, #1       // (t5 - t1) >> 1
+        srshr   v3.8h, v22.8h, #2       // (t5 + t1 + 4) >> 3
+        srshr   v4.8h, v5.8h, #2        // (t6 + t2 + 4) >> 3
+        srshr   v5.8h, v16.8h, #2       // (t7 + t3 + 4) >> 3
+        srshr   v6.8h, v17.8h, #2       // (t8 + t4 + 4) >> 3
+        srshr   v2.8h, v2.8h, #2        // (t8 - t4 + 4) >> 3
+        srshr   v1.8h, v1.8h, #2        // (t7 - t3 + 4) >> 3
+        srshr   v7.8h, v21.8h, #2       // (t6 - t2 + 4) >> 3
+        srshr   v16.8h, v23.8h, #2      // (t5 - t1 + 4) >> 3
+        trn2    v17.8h, v3.8h, v4.8h    // 8x8 transpose of the eight first-pass results starts here (trn1/trn2 at .8h, then .4s, then .2d)
+        trn2    v18.8h, v5.8h, v6.8h
+        trn2    v19.8h, v2.8h, v1.8h
+        trn2    v20.8h, v7.8h, v16.8h
+        trn1    v21.4s, v17.4s, v18.4s
+        trn2    v17.4s, v17.4s, v18.4s
+        trn1    v18.4s, v19.4s, v20.4s
+        trn2    v19.4s, v19.4s, v20.4s
+        trn1    v3.8h, v3.8h, v4.8h
+        trn2    v4.2d, v21.2d, v18.2d   // v4  = second-pass input labelled src[40] below
+        trn1    v20.2d, v17.2d, v19.2d  // v20 = second-pass input labelled src[24] below
+        trn1    v5.8h, v5.8h, v6.8h
+        trn1    v1.8h, v2.8h, v1.8h
+        trn1    v2.8h, v7.8h, v16.8h
+        trn1    v6.2d, v21.2d, v18.2d   // v6  = second-pass input labelled src[8] below
+        trn2    v7.2d, v17.2d, v19.2d   // v7  = second-pass input labelled src[56] below
+        shl     v16.8h, v20.8h, #4      //                        16 * src[24]
+        shl     v17.8h, v4.8h, #4       //                                       16 * src[40]
+        trn1    v18.4s, v3.4s, v5.4s
+        trn1    v19.4s, v1.4s, v2.4s
+        shl     v21.8h, v7.8h, #4       //                                                      16 * src[56]
+        shl     v22.8h, v6.8h, #2       //           4 * src[8]
+        shl     v23.8h, v4.8h, #2       //                                        4 * src[40]
+        trn2    v3.4s, v3.4s, v5.4s
+        trn2    v1.4s, v1.4s, v2.4s
+        shl     v2.8h, v6.8h, #4        //          16 * src[8]
+        sub     v5.8h, v16.8h, v23.8h   //                        16 * src[24] -  4 * src[40]
+        ssra    v17.8h, v16.8h, #2      //                         4 * src[24] + 16 * src[40]
+        sub     v16.8h, v21.8h, v22.8h  //        -  4 * src[8]                               + 16 * src[56]
+        trn1    v22.2d, v18.2d, v19.2d  // v22 = second-pass input labelled src[0] below
+        trn2    v18.2d, v18.2d, v19.2d  // v18 = second-pass input labelled src[32] below
+        trn1    v19.2d, v3.2d, v1.2d    // v19 = second-pass input labelled src[16] below
+        ssra    v2.8h, v21.8h, #2       //          16 * src[8]                               +  4 * src[56]
+        mls     v17.8h, v6.8h, v0.h[2]  //        - 15 * src[8] +  4 * src[24] + 16 * src[40]
+        shl     v21.8h, v22.8h, #2      //         8/2 * src[0]
+        shl     v18.8h, v18.8h, #2      //                                      8/2 * src[32]
+        mls     v5.8h, v6.8h, v0.h[1]   //        -  9 * src[8] + 16 * src[24] -  4 * src[40]
+        shl     v6.8h, v19.8h, #3       //                      16/2 * src[16]
+        trn2    v1.2d, v3.2d, v1.2d     // v1  = second-pass input labelled src[48] below
+        mla     v16.8h, v20.8h, v0.h[1] //        -  4 * src[8] +  9 * src[24]                + 16 * src[56]
+        ssra    v21.8h, v21.8h, #1      //        12/2 * src[0]
+        ssra    v18.8h, v18.8h, #1      //                                     12/2 * src[32]
+        mul     v3.8h, v19.8h, v0.h[0]  //                       6/2 * src[16]
+        shl     v19.8h, v1.8h, #3       //                                                    16/2 * src[48]
+        mla     v2.8h, v20.8h, v0.h[2]  //          16 * src[8] + 15 * src[24]                +  4 * src[56]
+        add     v20.8h, v21.8h, v18.8h  // t1/2 = 12/2 * src[0]              + 12/2 * src[32]
+        mla     v6.8h, v1.8h, v0.h[0]   // t3/2 =               16/2 * src[16]              +  6/2 * src[48]
+        sub     v1.8h, v21.8h, v18.8h   // t2/2 = 12/2 * src[0]              - 12/2 * src[32]
+        sub     v3.8h, v3.8h, v19.8h    // t4/2 =                6/2 * src[16]              - 16/2 * src[48]
+        mla     v17.8h, v7.8h, v0.h[1]  // -t2  = - 15 * src[8] +  4 * src[24] + 16 * src[40] +  9 * src[56]
+        mls     v5.8h, v7.8h, v0.h[2]   // -t3  = -  9 * src[8] + 16 * src[24] -  4 * src[40] - 15 * src[56]
+        add     v7.8h, v1.8h, v3.8h     // t6/2 = t2/2 + t4/2
+        add     v18.8h, v20.8h, v6.8h   // t5/2 = t1/2 + t3/2
+        mls     v16.8h, v4.8h, v0.h[2]  // -t4  = -  4 * src[8] +  9 * src[24] - 15 * src[40] + 16 * src[56]
+        sub     v19.8h, v1.8h, v3.8h    // t7/2 = t2/2 - t4/2
+        neg     v21.8h, v17.8h          // +t2
+        mla     v2.8h, v4.8h, v0.h[1]   //  t1  =   16 * src[8] + 15 * src[24] +  9 * src[40] +  4 * src[56]
+        sub     v0.8h, v20.8h, v6.8h    // t8/2 = t1/2 - t3/2 (overwrites v0: the multipliers are not read again)
+        neg     v4.8h, v5.8h            // +t3
+        sub     v22.8h, v1.8h, v3.8h    // t7/2 = t2/2 - t4/2 (second copy, for the t7 - t3 output)
+        sub     v23.8h, v20.8h, v6.8h   // t8/2 = t1/2 - t3/2 (second copy, for the t8 - t4 output)
+        neg     v24.8h, v16.8h          // +t4
+        add     v6.8h, v20.8h, v6.8h    // t5/2 = t1/2 + t3/2 (second copy, for the t5 - t1 output)
+        add     v1.8h, v1.8h, v3.8h     // t6/2 = t2/2 + t4/2 (second copy, for the t6 - t2 output)
+        ssra    v7.8h, v21.8h, #1       // (t6 + t2) >> 1
+        neg     v3.8h, v2.8h            // -t1
+        ssra    v18.8h, v2.8h, #1       // (t5 + t1) >> 1
+        ssra    v19.8h, v4.8h, #1       // (t7 + t3) >> 1
+        ssra    v0.8h, v24.8h, #1       // (t8 + t4) >> 1
+        srsra   v23.8h, v16.8h, #1      // (t8 - t4 + 1) >> 1
+        srsra   v22.8h, v5.8h, #1       // (t7 - t3 + 1) >> 1
+        srsra   v1.8h, v17.8h, #1       // (t6 - t2 + 1) >> 1
+        srsra   v6.8h, v3.8h, #1        // (t5 - t1 + 1) >> 1
+        srshr   v2.8h, v18.8h, #6       // (t5 + t1 + 64) >> 7
+        srshr   v3.8h, v7.8h, #6        // (t6 + t2 + 64) >> 7
+        srshr   v4.8h, v19.8h, #6       // (t7 + t3 + 64) >> 7
+        srshr   v5.8h, v0.8h, #6        // (t8 + t4 + 64) >> 7
+        srshr   v16.8h, v23.8h, #6      // (t8 - t4 + 65) >> 7
+        srshr   v17.8h, v22.8h, #6      // (t7 - t3 + 65) >> 7
+        st1     {v2.16b, v3.16b}, [x1], #32     // write results back over the input block, 32 bytes at a time
+        srshr   v0.8h, v1.8h, #6        // (t6 - t2 + 65) >> 7
+        srshr   v1.8h, v6.8h, #6        // (t5 - t1 + 65) >> 7
+        st1     {v4.16b, v5.16b}, [x1], #32
+        st1     {v16.16b, v17.16b}, [x1], #32
+        st1     {v0.16b, v1.16b}, [x1]
+        ret
+endfunc
+
+// VC-1 8x4 inverse transform (8-point pass along rows, then 4-point pass down columns)
+// On entry:
+//   x0 -> array of 8-bit samples, in row-major order
+//   x1 = row stride for 8-bit sample array
+//   x2 -> array of 16-bit inverse transform coefficients, in row-major order
+// On exit:
+//   array at x0 updated by saturated addition of (narrowed) transformed block
+function ff_vc1_inv_trans_8x4_neon, export=1
+        ld1     {v1.8b, v2.8b, v3.8b, v4.8b}, [x2], #32    // coefficient rows 0-1, four 16-bit values per register
+        mov     x3, x0                  // keep the original dest pointer for the stores (x0 is advanced by the loads)
+        ld1     {v16.8b, v17.8b, v18.8b, v19.8b}, [x2]     // coefficient rows 2-3, four 16-bit values per register
+        ldr     q0, .Lcoeffs_it8        // includes 4-point coefficients in upper half of vector
+        ld1     {v5.8b}, [x0], x1       // dest row 0
+        trn2    v6.4h, v1.4h, v3.4h     // transpose (trn at .4h, then .2s): afterwards each register holds one coefficient index, one lane per row
+        trn2    v7.4h, v2.4h, v4.4h
+        trn1    v1.4h, v1.4h, v3.4h
+        trn1    v2.4h, v2.4h, v4.4h
+        trn2    v3.4h, v16.4h, v18.4h
+        trn2    v4.4h, v17.4h, v19.4h
+        trn1    v16.4h, v16.4h, v18.4h
+        trn1    v17.4h, v17.4h, v19.4h
+        ld1     {v18.8b}, [x0], x1      // dest row 1
+        trn1    v19.2s, v6.2s, v3.2s    // v19 = src[1] for the four rows
+        trn2    v3.2s, v6.2s, v3.2s     // v3  = src[3]
+        trn1    v6.2s, v7.2s, v4.2s     // v6  = src[5]
+        trn2    v4.2s, v7.2s, v4.2s     // v4  = src[7]
+        trn1    v7.2s, v1.2s, v16.2s    // v7  = src[0]
+        trn1    v20.2s, v2.2s, v17.2s   // v20 = src[4]
+        shl     v21.4h, v19.4h, #4      //          16 * src[1]
+        trn2    v1.2s, v1.2s, v16.2s    // v1  = src[2]
+        shl     v16.4h, v3.4h, #4       //                        16 * src[3]
+        trn2    v2.2s, v2.2s, v17.2s    // v2  = src[6]
+        shl     v17.4h, v6.4h, #4       //                                      16 * src[5]
+        ld1     {v22.8b}, [x0], x1      // dest row 2
+        shl     v23.4h, v4.4h, #4       //                                                    16 * src[7]
+        mul     v24.4h, v1.4h, v0.h[0]  //                       6/2 * src[2]
+        ld1     {v25.8b}, [x0]          // dest row 3
+        shl     v26.4h, v19.4h, #2      //           4 * src[1]
+        shl     v27.4h, v6.4h, #2       //                                       4 * src[5]
+        ssra    v21.4h, v23.4h, #2      //          16 * src[1]                             +  4 * src[7]
+        ssra    v17.4h, v16.4h, #2      //                         4 * src[3] + 16 * src[5]
+        sub     v23.4h, v23.4h, v26.4h  //        -  4 * src[1]                             + 16 * src[7]
+        sub     v16.4h, v16.4h, v27.4h  //                        16 * src[3] -  4 * src[5]
+        shl     v7.4h, v7.4h, #2        //         8/2 * src[0]
+        shl     v20.4h, v20.4h, #2      //                                     8/2 * src[4]
+        mla     v21.4h, v3.4h, v0.h[2]  //          16 * src[1] + 15 * src[3]               +  4 * src[7]
+        shl     v1.4h, v1.4h, #3        //                      16/2 * src[2]
+        mls     v17.4h, v19.4h, v0.h[2] //        - 15 * src[1] +  4 * src[3] + 16 * src[5]
+        ssra    v7.4h, v7.4h, #1        //        12/2 * src[0]
+        mls     v16.4h, v19.4h, v0.h[1] //        -  9 * src[1] + 16 * src[3] -  4 * src[5]
+        ssra    v20.4h, v20.4h, #1      //                                    12/2 * src[4]
+        mla     v23.4h, v3.4h, v0.h[1]  //        -  4 * src[1] +  9 * src[3]               + 16 * src[7]
+        shl     v3.4h, v2.4h, #3        //                                                  16/2 * src[6]
+        mla     v1.4h, v2.4h, v0.h[0]   // t3/2 =               16/2 * src[2]             +  6/2 * src[6]
+        mla     v21.4h, v6.4h, v0.h[1]  //  t1  =   16 * src[1] + 15 * src[3] +  9 * src[5] +  4 * src[7]
+        mla     v17.4h, v4.4h, v0.h[1]  // -t2  = - 15 * src[1] +  4 * src[3] + 16 * src[5] +  9 * src[7]
+        sub     v2.4h, v24.4h, v3.4h    // t4/2 =                6/2 * src[2]             - 16/2 * src[6]
+        mls     v16.4h, v4.4h, v0.h[2]  // -t3  = -  9 * src[1] + 16 * src[3] -  4 * src[5] - 15 * src[7]
+        add     v3.4h, v7.4h, v20.4h    // t1/2 = 12/2 * src[0]             + 12/2 * src[4]
+        mls     v23.4h, v6.4h, v0.h[2]  // -t4  = -  4 * src[1] +  9 * src[3] - 15 * src[5] + 16 * src[7]
+        sub     v4.4h, v7.4h, v20.4h    // t2/2 = 12/2 * src[0]             - 12/2 * src[4]
+        neg     v6.4h, v21.4h           // -t1
+        add     v7.4h, v3.4h, v1.4h     // t5/2 = t1/2 + t3/2
+        sub     v19.4h, v3.4h, v1.4h    // t8/2 = t1/2 - t3/2
+        add     v20.4h, v4.4h, v2.4h    // t6/2 = t2/2 + t4/2
+        sub     v24.4h, v4.4h, v2.4h    // t7/2 = t2/2 - t4/2
+        add     v26.4h, v3.4h, v1.4h    // t5/2 = t1/2 + t3/2 (second copy, for the t5 - t1 output)
+        add     v27.4h, v4.4h, v2.4h    // t6/2 = t2/2 + t4/2 (second copy, for the t6 - t2 output)
+        sub     v2.4h, v4.4h, v2.4h     // t7/2 = t2/2 - t4/2 (second copy, for the t7 - t3 output)
+        sub     v1.4h, v3.4h, v1.4h     // t8/2 = t1/2 - t3/2 (second copy, for the t8 - t4 output)
+        neg     v3.4h, v17.4h           // +t2
+        neg     v4.4h, v16.4h           // +t3
+        neg     v28.4h, v23.4h          // +t4
+        ssra    v7.4h, v21.4h, #1       // (t5 + t1) >> 1
+        ssra    v1.4h, v23.4h, #1       // (t8 - t4) >> 1
+        ssra    v20.4h, v3.4h, #1       // (t6 + t2) >> 1
+        ssra    v24.4h, v4.4h, #1       // (t7 + t3) >> 1
+        ssra    v19.4h, v28.4h, #1      // (t8 + t4) >> 1
+        ssra    v2.4h, v16.4h, #1       // (t7 - t3) >> 1
+        ssra    v27.4h, v17.4h, #1      // (t6 - t2) >> 1
+        ssra    v26.4h, v6.4h, #1       // (t5 - t1) >> 1
+        trn1    v1.2d, v7.2d, v1.2d     // pack two 4-lane results per register (low half, high half) so the rounding shifts run 8 lanes wide
+        trn1    v2.2d, v20.2d, v2.2d
+        trn1    v3.2d, v24.2d, v27.2d
+        trn1    v4.2d, v19.2d, v26.2d
+        srshr   v1.8h, v1.8h, #2        // (t5 + t1 + 4) >> 3, (t8 - t4 + 4) >> 3
+        srshr   v2.8h, v2.8h, #2        // (t6 + t2 + 4) >> 3, (t7 - t3 + 4) >> 3
+        srshr   v3.8h, v3.8h, #2        // (t7 + t3 + 4) >> 3, (t6 - t2 + 4) >> 3
+        srshr   v4.8h, v4.8h, #2        // (t8 + t4 + 4) >> 3, (t5 - t1 + 4) >> 3
+        trn2    v6.8h, v1.8h, v2.8h     // transpose back (trn at .8h, then .4s): afterwards each register holds one row of 8 first-pass results
+        trn1    v1.8h, v1.8h, v2.8h
+        trn2    v2.8h, v3.8h, v4.8h
+        trn1    v3.8h, v3.8h, v4.8h
+        trn2    v4.4s, v6.4s, v2.4s     // v4 = row labelled src[24] below
+        trn1    v7.4s, v1.4s, v3.4s     // v7 = row labelled src[0] below
+        trn2    v1.4s, v1.4s, v3.4s     // v1 = row labelled src[16] below
+        mul     v3.8h, v4.8h, v0.h[5]   //                                                           22/2 * src[24]
+        trn1    v2.4s, v6.4s, v2.4s     // v2 = row labelled src[8] below
+        mul     v4.8h, v4.8h, v0.h[4]   //                                                           10/2 * src[24]
+        mul     v6.8h, v7.8h, v0.h[6]   //            17 * src[0]
+        mul     v1.8h, v1.8h, v0.h[6]   //                                            17 * src[16]
+        mls     v3.8h, v2.8h, v0.h[4]   //  t4/2 =                - 10/2 * src[8]                  + 22/2 * src[24]
+        mla     v4.8h, v2.8h, v0.h[5]   //  t3/2 =                  22/2 * src[8]                  + 10/2 * src[24]
+        add     v0.8h, v6.8h, v1.8h     //   t1  =    17 * src[0]                 +   17 * src[16] (overwrites v0: the multipliers are not read again)
+        sub     v1.8h, v6.8h, v1.8h     //   t2  =    17 * src[0]                 -   17 * src[16]
+        neg     v2.8h, v3.8h            // -t4/2
+        neg     v6.8h, v4.8h            // -t3/2
+        ssra    v4.8h, v0.8h, #1        // (t1 + t3) >> 1
+        ssra    v2.8h, v1.8h, #1        // (t2 - t4) >> 1
+        ssra    v3.8h, v1.8h, #1        // (t2 + t4) >> 1
+        ssra    v6.8h, v0.8h, #1        // (t1 - t3) >> 1
+        srshr   v0.8h, v4.8h, #6        // (t1 + t3 + 64) >> 7
+        srshr   v1.8h, v2.8h, #6        // (t2 - t4 + 64) >> 7
+        srshr   v2.8h, v3.8h, #6        // (t2 + t4 + 64) >> 7
+        srshr   v3.8h, v6.8h, #6        // (t1 - t3 + 64) >> 7
+        uaddw   v0.8h, v0.8h, v5.8b     // add zero-extended dest row 0 to the 16-bit residual
+        uaddw   v1.8h, v1.8h, v18.8b    // likewise dest row 1
+        uaddw   v2.8h, v2.8h, v22.8b    // likewise dest row 2
+        uaddw   v3.8h, v3.8h, v25.8b    // likewise dest row 3
+        sqxtun  v0.8b, v0.8h            // narrow with saturation to unsigned 8-bit
+        sqxtun  v1.8b, v1.8h
+        sqxtun  v2.8b, v2.8h
+        sqxtun  v3.8b, v3.8h
+        st1     {v0.8b}, [x3], x1       // write dest rows 0-3 via the saved pointer
+        st1     {v1.8b}, [x3], x1
+        st1     {v2.8b}, [x3], x1
+        st1     {v3.8b}, [x3]
+        ret
+endfunc
+
+// VC-1 4x8 inverse transform (4-point pass along rows, then 8-point pass down columns)
+// On entry:
+//   x0 -> array of 8-bit samples, in row-major order
+//   x1 = row stride for 8-bit sample array
+//   x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients)
+// On exit:
+//   array at x0 updated by saturated addition of (narrowed) transformed block
+function ff_vc1_inv_trans_4x8_neon, export=1
+        mov     x3, #16                 // coefficient row stride in bytes (8 coefficients x 16 bits)
+        ldr     q0, .Lcoeffs_it8        // includes 4-point coefficients in upper half of vector
+        mov     x4, x0                  // keep the original dest pointer for the stores (x0 is advanced by the loads)
+        ld1     {v1.d}[0], [x2], x3     // 00 01 02 03
+        ld1     {v2.d}[0], [x2], x3     // 10 11 12 13
+        ld1     {v3.d}[0], [x2], x3     // 20 21 22 23
+        ld1     {v4.d}[0], [x2], x3     // 30 31 32 33
+        ld1     {v1.d}[1], [x2], x3     // 40 41 42 43
+        ld1     {v2.d}[1], [x2], x3     // 50 51 52 53
+        ld1     {v3.d}[1], [x2], x3     // 60 61 62 63
+        ld1     {v4.d}[1], [x2]         // 70 71 72 73
+        ld1     {v5.s}[0], [x0], x1     // dest row 0
+        ld1     {v6.s}[0], [x0], x1     // dest row 1
+        ld1     {v7.s}[0], [x0], x1     // dest row 2
+        trn2    v16.8h, v1.8h, v2.8h    // 01 11 03 13 41 51 43 53
+        trn1    v1.8h, v1.8h, v2.8h     // 00 10 02 12 40 50 42 52
+        trn2    v2.8h, v3.8h, v4.8h     // 21 31 23 33 61 71 63 73
+        trn1    v3.8h, v3.8h, v4.8h     // 20 30 22 32 60 70 62 72
+        ld1     {v4.s}[0], [x0], x1     // dest row 3
+        trn2    v17.4s, v16.4s, v2.4s   // 03 13 23 33 43 53 63 73
+        trn1    v18.4s, v1.4s, v3.4s    // 00 10 20 30 40 50 60 70
+        trn1    v2.4s, v16.4s, v2.4s    // 01 11 21 31 41 51 61 71
+        mul     v16.8h, v17.8h, v0.h[4] //                                                          10/2 * src[3]
+        ld1     {v5.s}[1], [x0], x1     // dest row 4 (shares v5 with row 0)
+        mul     v17.8h, v17.8h, v0.h[5] //                                                          22/2 * src[3]
+        ld1     {v6.s}[1], [x0], x1     // dest row 5 (shares v6 with row 1)
+        trn2    v1.4s, v1.4s, v3.4s     // 02 12 22 32 42 52 62 72
+        mul     v3.8h, v18.8h, v0.h[6]  //            17 * src[0]
+        ld1     {v7.s}[1], [x0], x1     // dest row 6 (shares v7 with row 2)
+        mul     v1.8h, v1.8h, v0.h[6]   //                                            17 * src[2]
+        ld1     {v4.s}[1], [x0]         // dest row 7 (shares v4 with row 3)
+        mla     v16.8h, v2.8h, v0.h[5]  //  t3/2 =                  22/2 * src[1]                 + 10/2 * src[3]
+        mls     v17.8h, v2.8h, v0.h[4]  //  t4/2 =                - 10/2 * src[1]                 + 22/2 * src[3]
+        add     v2.8h, v3.8h, v1.8h     //   t1  =    17 * src[0]                 +   17 * src[2]
+        sub     v1.8h, v3.8h, v1.8h     //   t2  =    17 * src[0]                 -   17 * src[2]
+        neg     v3.8h, v16.8h           // -t3/2
+        ssra    v16.8h, v2.8h, #1       // (t1 + t3) >> 1
+        neg     v18.8h, v17.8h          // -t4/2
+        ssra    v17.8h, v1.8h, #1       // (t2 + t4) >> 1
+        ssra    v3.8h, v2.8h, #1        // (t1 - t3) >> 1
+        ssra    v18.8h, v1.8h, #1       // (t2 - t4) >> 1
+        srshr   v1.8h, v16.8h, #2       // (t1 + t3 + 4) >> 3
+        srshr   v2.8h, v17.8h, #2       // (t2 + t4 + 4) >> 3
+        srshr   v3.8h, v3.8h, #2        // (t1 - t3 + 4) >> 3
+        srshr   v16.8h, v18.8h, #2      // (t2 - t4 + 4) >> 3
+        trn2    v17.8h, v2.8h, v3.8h    // 12 13 32 33 52 53 72 73
+        trn2    v18.8h, v1.8h, v16.8h   // 10 11 30 31 50 51 70 71
+        trn1    v1.8h, v1.8h, v16.8h    // 00 01 20 21 40 41 60 61
+        trn1    v2.8h, v2.8h, v3.8h     // 02 03 22 23 42 43 62 63
+        trn1    v3.4s, v18.4s, v17.4s   // 10 11 12 13 50 51 52 53
+        trn2    v16.4s, v18.4s, v17.4s  // 30 31 32 33 70 71 72 73
+        trn1    v17.4s, v1.4s, v2.4s    // 00 01 02 03 40 41 42 43
+        mov     d18, v3.d[1]            // 50 51 52 53
+        shl     v19.4h, v3.4h, #4       //          16 * src[8]
+        mov     d20, v16.d[1]           // 70 71 72 73
+        shl     v21.4h, v16.4h, #4      //                        16 * src[24]
+        mov     d22, v17.d[1]           // 40 41 42 43
+        shl     v23.4h, v3.4h, #2       //           4 * src[8]
+        shl     v24.4h, v18.4h, #4      //                                       16 * src[40]
+        shl     v25.4h, v20.4h, #4      //                                                      16 * src[56]
+        shl     v26.4h, v18.4h, #2      //                                        4 * src[40]
+        trn2    v1.4s, v1.4s, v2.4s     // 20 21 22 23 60 61 62 63
+        ssra    v24.4h, v21.4h, #2      //                         4 * src[24] + 16 * src[40]
+        sub     v2.4h, v25.4h, v23.4h   //        -  4 * src[8]                               + 16 * src[56]
+        shl     v17.4h, v17.4h, #2      //         8/2 * src[0]
+        sub     v21.4h, v21.4h, v26.4h  //                        16 * src[24] -  4 * src[40]
+        shl     v22.4h, v22.4h, #2      //                                      8/2 * src[32]
+        mov     d23, v1.d[1]            // 60 61 62 63
+        ssra    v19.4h, v25.4h, #2      //          16 * src[8]                               +  4 * src[56]
+        mul     v25.4h, v1.4h, v0.h[0]  //                       6/2 * src[16]
+        shl     v1.4h, v1.4h, #3        //                      16/2 * src[16]
+        mls     v24.4h, v3.4h, v0.h[2]  //        - 15 * src[8] +  4 * src[24] + 16 * src[40]
+        ssra    v17.4h, v17.4h, #1      //        12/2 * src[0]
+        mls     v21.4h, v3.4h, v0.h[1]  //        -  9 * src[8] + 16 * src[24] -  4 * src[40]
+        ssra    v22.4h, v22.4h, #1      //                                     12/2 * src[32]
+        mla     v2.4h, v16.4h, v0.h[1]  //        -  4 * src[8] +  9 * src[24]                + 16 * src[56]
+        shl     v3.4h, v23.4h, #3       //                                                    16/2 * src[48]
+        mla     v19.4h, v16.4h, v0.h[2] //          16 * src[8] + 15 * src[24]                +  4 * src[56]
+        mla     v1.4h, v23.4h, v0.h[0]  // t3/2 =               16/2 * src[16]              +  6/2 * src[48]
+        mla     v24.4h, v20.4h, v0.h[1] // -t2  = - 15 * src[8] +  4 * src[24] + 16 * src[40] +  9 * src[56]
+        add     v16.4h, v17.4h, v22.4h  // t1/2 = 12/2 * src[0]              + 12/2 * src[32]
+        sub     v3.4h, v25.4h, v3.4h    // t4/2 =                6/2 * src[16]              - 16/2 * src[48]
+        sub     v17.4h, v17.4h, v22.4h  // t2/2 = 12/2 * src[0]              - 12/2 * src[32]
+        mls     v21.4h, v20.4h, v0.h[2] // -t3  = -  9 * src[8] + 16 * src[24] -  4 * src[40] - 15 * src[56]
+        mla     v19.4h, v18.4h, v0.h[1] //  t1  =   16 * src[8] + 15 * src[24] +  9 * src[40] +  4 * src[56]
+        add     v20.4h, v16.4h, v1.4h   // t5/2 = t1/2 + t3/2
+        mls     v2.4h, v18.4h, v0.h[2]  // -t4  = -  4 * src[8] +  9 * src[24] - 15 * src[40] + 16 * src[56]
+        sub     v0.4h, v16.4h, v1.4h    // t8/2 = t1/2 - t3/2 (overwrites v0: the multipliers are not read again)
+        add     v18.4h, v17.4h, v3.4h   // t6/2 = t2/2 + t4/2
+        sub     v22.4h, v17.4h, v3.4h   // t7/2 = t2/2 - t4/2
+        neg     v23.4h, v24.4h          // +t2
+        sub     v25.4h, v17.4h, v3.4h   // t7/2 = t2/2 - t4/2 (second copy, for the t7 + t3 output)
+        add     v3.4h, v17.4h, v3.4h    // t6/2 = t2/2 + t4/2 (second copy, for the t6 - t2 output)
+        neg     v17.4h, v21.4h          // +t3
+        sub     v26.4h, v16.4h, v1.4h   // t8/2 = t1/2 - t3/2 (second copy, for the t8 + t4 output)
+        add     v1.4h, v16.4h, v1.4h    // t5/2 = t1/2 + t3/2 (second copy, for the t5 - t1 output)
+        neg     v16.4h, v19.4h          // -t1
+        neg     v27.4h, v2.4h           // +t4
+        ssra    v20.4h, v19.4h, #1      // (t5 + t1) >> 1
+        srsra   v0.4h, v2.4h, #1        // (t8 - t4 + 1) >> 1
+        ssra    v18.4h, v23.4h, #1      // (t6 + t2) >> 1
+        srsra   v22.4h, v21.4h, #1      // (t7 - t3 + 1) >> 1
+        ssra    v25.4h, v17.4h, #1      // (t7 + t3) >> 1
+        srsra   v3.4h, v24.4h, #1       // (t6 - t2 + 1) >> 1
+        ssra    v26.4h, v27.4h, #1      // (t8 + t4) >> 1
+        srsra   v1.4h, v16.4h, #1       // (t5 - t1 + 1) >> 1
+        trn1    v0.2d, v20.2d, v0.2d    // pack output rows 0 (low half) and 4 (high half), matching the v5 dest layout
+        trn1    v2.2d, v18.2d, v22.2d   // pack output rows 1 and 5, matching v6
+        trn1    v3.2d, v25.2d, v3.2d    // pack output rows 2 and 6, matching v7
+        trn1    v1.2d, v26.2d, v1.2d    // pack output rows 3 and 7, matching v4
+        srshr   v0.8h, v0.8h, #6        // (t5 + t1 + 64) >> 7, (t8 - t4 + 65) >> 7
+        srshr   v2.8h, v2.8h, #6        // (t6 + t2 + 64) >> 7, (t7 - t3 + 65) >> 7
+        srshr   v3.8h, v3.8h, #6        // (t7 + t3 + 64) >> 7, (t6 - t2 + 65) >> 7
+        srshr   v1.8h, v1.8h, #6        // (t8 + t4 + 64) >> 7, (t5 - t1 + 65) >> 7
+        uaddw   v0.8h, v0.8h, v5.8b     // add zero-extended dest rows 0 and 4 to the 16-bit residual
+        uaddw   v2.8h, v2.8h, v6.8b     // likewise dest rows 1 and 5
+        uaddw   v3.8h, v3.8h, v7.8b     // likewise dest rows 2 and 6
+        uaddw   v1.8h, v1.8h, v4.8b     // likewise dest rows 3 and 7
+        sqxtun  v0.8b, v0.8h            // narrow with saturation to unsigned 8-bit
+        sqxtun  v2.8b, v2.8h
+        sqxtun  v3.8b, v3.8h
+        sqxtun  v1.8b, v1.8h
+        st1     {v0.s}[0], [x4], x1     // write dest rows 0-3 (low 32-bit lanes) via the saved pointer
+        st1     {v2.s}[0], [x4], x1
+        st1     {v3.s}[0], [x4], x1
+        st1     {v1.s}[0], [x4], x1
+        st1     {v0.s}[1], [x4], x1     // write dest rows 4-7 (second 32-bit lanes)
+        st1     {v2.s}[1], [x4], x1
+        st1     {v3.s}[1], [x4], x1
+        st1     {v1.s}[1], [x4]
+        ret
+endfunc
+
+// VC-1 4x4 inverse transform
+// On entry:
+//   x0 -> array of 8-bit samples, in row-major order
+//   x1 = row stride for 8-bit sample array
+//   x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients)
+// On exit:
+//   array at x0 updated by saturated addition of (narrowed) transformed block
+function ff_vc1_inv_trans_4x4_neon, export=1
+        mov     x3, #16                 // byte step between coefficient rows (8 x 16-bit)
+        ldr     d0, .Lcoeffs_it4        // v0.h[0..2] = 5, 11, 17 (used as 10/2, 22/2, 17)
+        mov     x4, x0                  // second copy of the sample pointer, used for the stores
+        ld1     {v1.d}[0], [x2], x3     // 00 01 02 03
+        ld1     {v2.d}[0], [x2], x3     // 10 11 12 13
+        ld1     {v3.d}[0], [x2], x3     // 20 21 22 23
+        ld1     {v4.d}[0], [x2]         // 30 31 32 33
+        ld1     {v5.s}[0], [x0], x1     // sample row 0
+        ld1     {v5.s}[1], [x0], x1     // sample row 1
+        ld1     {v6.s}[0], [x0], x1     // sample row 2
+        trn2    v7.4h, v1.4h, v2.4h     // 01 11 03 13
+        trn1    v1.4h, v1.4h, v2.4h     // 00 10 02 12
+        ld1     {v6.s}[1], [x0]         // sample row 3
+        trn2    v2.4h, v3.4h, v4.4h     // 21 31 23 33
+        trn1    v3.4h, v3.4h, v4.4h     // 20 30 22 32
+        trn2    v4.2s, v7.2s, v2.2s     // 03 13 23 33
+        trn1    v16.2s, v1.2s, v3.2s    // 00 10 20 30
+        trn1    v2.2s, v7.2s, v2.2s     // 01 11 21 31
+        trn2    v1.2s, v1.2s, v3.2s     // 02 12 22 32
+        mul     v3.4h, v4.4h, v0.h[0]   //                                                          10/2 * src[3]
+        mul     v4.4h, v4.4h, v0.h[1]   //                                                          22/2 * src[3]
+        mul     v7.4h, v16.4h, v0.h[2]  //            17 * src[0]
+        mul     v1.4h, v1.4h, v0.h[2]   //                                            17 * src[2]
+        mla     v3.4h, v2.4h, v0.h[1]   //  t3/2 =                  22/2 * src[1]                 + 10/2 * src[3]
+        mls     v4.4h, v2.4h, v0.h[0]   //  t4/2 =                - 10/2 * src[1]                 + 22/2 * src[3]
+        add     v2.4h, v7.4h, v1.4h     //   t1  =    17 * src[0]                 +   17 * src[2]
+        sub     v1.4h, v7.4h, v1.4h     //   t2  =    17 * src[0]                 -   17 * src[2]
+        neg     v7.4h, v3.4h            // -t3/2
+        neg     v16.4h, v4.4h           // -t4/2
+        ssra    v3.4h, v2.4h, #1        // (t1 + t3) >> 1
+        ssra    v4.4h, v1.4h, #1        // (t2 + t4) >> 1
+        ssra    v16.4h, v1.4h, #1       // (t2 - t4) >> 1
+        ssra    v7.4h, v2.4h, #1        // (t1 - t3) >> 1
+        srshr   v1.4h, v3.4h, #2        // (t1 + t3 + 4) >> 3  (first-pass output column 0)
+        srshr   v2.4h, v4.4h, #2        // (t2 + t4 + 4) >> 3  (first-pass output column 2)
+        srshr   v3.4h, v16.4h, #2       // (t2 - t4 + 4) >> 3  (first-pass output column 1)
+        srshr   v4.4h, v7.4h, #2        // (t1 - t3 + 4) >> 3  (first-pass output column 3)
+        trn2    v7.4h, v1.4h, v3.4h     // 10 11 30 31
+        trn1    v1.4h, v1.4h, v3.4h     // 00 01 20 21
+        trn2    v3.4h, v2.4h, v4.4h     // 12 13 32 33
+        trn1    v2.4h, v2.4h, v4.4h     // 02 03 22 23
+        trn2    v4.2s, v7.2s, v3.2s     // 30 31 32 33
+        trn1    v16.2s, v1.2s, v2.2s    // 00 01 02 03
+        trn1    v3.2s, v7.2s, v3.2s     // 10 11 12 13
+        trn2    v1.2s, v1.2s, v2.2s     // 20 21 22 23
+        mul     v2.4h, v4.4h, v0.h[1]   //                                                           22/2 * src[24]
+        mul     v4.4h, v4.4h, v0.h[0]   //                                                           10/2 * src[24]
+        mul     v7.4h, v16.4h, v0.h[2]  //            17 * src[0]
+        mul     v1.4h, v1.4h, v0.h[2]   //                                            17 * src[16]
+        mls     v2.4h, v3.4h, v0.h[0]   //  t4/2 =                - 10/2 * src[8]                  + 22/2 * src[24]
+        mla     v4.4h, v3.4h, v0.h[1]   //  t3/2 =                  22/2 * src[8]                  + 10/2 * src[24]
+        add     v0.4h, v7.4h, v1.4h     //   t1  =    17 * src[0]                 +   17 * src[16]  (constants in v0 are dead from here)
+        sub     v1.4h, v7.4h, v1.4h     //   t2  =    17 * src[0]                 -   17 * src[16]
+        neg     v3.4h, v2.4h            // -t4/2
+        neg     v7.4h, v4.4h            // -t3/2
+        ssra    v4.4h, v0.4h, #1        // (t1 + t3) >> 1
+        ssra    v3.4h, v1.4h, #1        // (t2 - t4) >> 1
+        ssra    v2.4h, v1.4h, #1        // (t2 + t4) >> 1
+        ssra    v7.4h, v0.4h, #1        // (t1 - t3) >> 1
+        trn1    v0.2d, v4.2d, v3.2d     // pack output rows 0 and 1 into one 8-lane vector
+        trn1    v1.2d, v2.2d, v7.2d     // pack output rows 2 and 3 into one 8-lane vector
+        srshr   v0.8h, v0.8h, #6        // (t1 + t3 + 64) >> 7, (t2 - t4 + 64) >> 7
+        srshr   v1.8h, v1.8h, #6        // (t2 + t4 + 64) >> 7, (t1 - t3 + 64) >> 7
+        uaddw   v0.8h, v0.8h, v5.8b     // add zero-extended sample rows 0 and 1
+        uaddw   v1.8h, v1.8h, v6.8b     // add zero-extended sample rows 2 and 3
+        sqxtun  v0.8b, v0.8h            // saturate the signed sums to unsigned 8 bits
+        sqxtun  v1.8b, v1.8h
+        st1     {v0.s}[0], [x4], x1     // write back rows 0-3 through the saved pointer
+        st1     {v0.s}[1], [x4], x1
+        st1     {v1.s}[0], [x4], x1
+        st1     {v1.s}[1], [x4]
+        ret
+endfunc
+
+// VC-1 8x8 inverse transform, DC case
+// On entry:
+//   x0 -> array of 8-bit samples, in row-major order
+//   x1 = row stride for 8-bit sample array
+//   x2 -> 16-bit inverse transform DC coefficient
+// On exit:
+//   array at x0 updated by saturated addition of (narrowed) transformed block
+function ff_vc1_inv_trans_8x8_dc_neon, export=1
+        ldrsh   w2, [x2]                // dc = sign-extended 16-bit coefficient at x2
+        mov     x3, x0                  // second copy of the sample pointer, used for the stores
+        ld1     {v0.8b}, [x0], x1       // sample rows 0-7 go to v0-v7; the loads are
+        ld1     {v1.8b}, [x0], x1       // interleaved with the scalar DC arithmetic
+        ld1     {v2.8b}, [x0], x1
+        add     w2, w2, w2, lsl #1      // 3 * dc
+        ld1     {v3.8b}, [x0], x1
+        ld1     {v4.8b}, [x0], x1
+        add     w2, w2, #1              // 3 * dc + 1
+        ld1     {v5.8b}, [x0], x1
+        asr     w2, w2, #1              // dc' = (3 * dc + 1) >> 1
+        ld1     {v6.8b}, [x0], x1
+        add     w2, w2, w2, lsl #1      // 3 * dc'
+        ld1     {v7.8b}, [x0]           // last use of x0 as a pointer
+        add     w0, w2, #16             // 3 * dc' + 16 (w0 reused as scratch)
+        asr     w0, w0, #5              // dc'' = (3 * dc' + 16) >> 5
+        dup     v16.8h, w0              // broadcast dc'' to all eight 16-bit lanes
+        uaddw   v0.8h, v16.8h, v0.8b    // dc'' + zero-extended samples, one row per register
+        uaddw   v1.8h, v16.8h, v1.8b
+        uaddw   v2.8h, v16.8h, v2.8b
+        uaddw   v3.8h, v16.8h, v3.8b
+        uaddw   v4.8h, v16.8h, v4.8b
+        uaddw   v5.8h, v16.8h, v5.8b
+        sqxtun  v0.8b, v0.8h            // saturate the signed sums to unsigned 8 bits
+        uaddw   v6.8h, v16.8h, v6.8b
+        sqxtun  v1.8b, v1.8h
+        uaddw   v7.8h, v16.8h, v7.8b
+        sqxtun  v2.8b, v2.8h
+        sqxtun  v3.8b, v3.8h
+        sqxtun  v4.8b, v4.8h
+        st1     {v0.8b}, [x3], x1       // row 0
+        sqxtun  v0.8b, v5.8h            // row 5 result reuses v0 now that row 0 is stored
+        st1     {v1.8b}, [x3], x1       // row 1
+        sqxtun  v1.8b, v6.8h            // row 6 result reuses v1
+        st1     {v2.8b}, [x3], x1       // row 2
+        sqxtun  v2.8b, v7.8h            // row 7 result reuses v2
+        st1     {v3.8b}, [x3], x1       // row 3
+        st1     {v4.8b}, [x3], x1       // row 4
+        st1     {v0.8b}, [x3], x1       // row 5
+        st1     {v1.8b}, [x3], x1       // row 6
+        st1     {v2.8b}, [x3]           // row 7
+        ret
+endfunc
+
+// VC-1 8x4 inverse transform, DC case
+// On entry:
+//   x0 -> array of 8-bit samples, in row-major order
+//   x1 = row stride for 8-bit sample array
+//   x2 -> 16-bit inverse transform DC coefficient
+// On exit:
+//   array at x0 updated by saturated addition of (narrowed) transformed block
+function ff_vc1_inv_trans_8x4_dc_neon, export=1
+        ldrsh   w2, [x2]                // dc = sign-extended 16-bit coefficient at x2
+        mov     x3, x0                  // second copy of the sample pointer, used for the stores
+        ld1     {v0.8b}, [x0], x1       // sample rows 0-3 go to v0-v3 (8 samples each)
+        ld1     {v1.8b}, [x0], x1
+        ld1     {v2.8b}, [x0], x1
+        add     w2, w2, w2, lsl #1      // 3 * dc
+        ld1     {v3.8b}, [x0]           // last use of x0 as a pointer
+        add     w0, w2, #1              // 3 * dc + 1 (w0 reused as scratch)
+        asr     w0, w0, #1              // dc' = (3 * dc + 1) >> 1
+        add     w0, w0, w0, lsl #4      // 17 * dc'
+        add     w0, w0, #64             // 17 * dc' + 64
+        asr     w0, w0, #7              // dc'' = (17 * dc' + 64) >> 7
+        dup     v4.8h, w0               // broadcast dc'' to all eight 16-bit lanes
+        uaddw   v0.8h, v4.8h, v0.8b     // dc'' + zero-extended samples, one row per register
+        uaddw   v1.8h, v4.8h, v1.8b
+        uaddw   v2.8h, v4.8h, v2.8b
+        uaddw   v3.8h, v4.8h, v3.8b
+        sqxtun  v0.8b, v0.8h            // saturate the signed sums to unsigned 8 bits
+        sqxtun  v1.8b, v1.8h
+        sqxtun  v2.8b, v2.8h
+        sqxtun  v3.8b, v3.8h
+        st1     {v0.8b}, [x3], x1       // write back rows 0-3 through the saved pointer
+        st1     {v1.8b}, [x3], x1
+        st1     {v2.8b}, [x3], x1
+        st1     {v3.8b}, [x3]
+        ret
+endfunc
+
+// VC-1 4x8 inverse transform, DC case
+// On entry:
+//   x0 -> array of 8-bit samples, in row-major order
+//   x1 = row stride for 8-bit sample array
+//   x2 -> 16-bit inverse transform DC coefficient
+// On exit:
+//   array at x0 updated by saturated addition of (narrowed) transformed block
+function ff_vc1_inv_trans_4x8_dc_neon, export=1
+        ldrsh   w2, [x2]                // dc = sign-extended 16-bit coefficient at x2
+        mov     x3, x0                  // second copy of the sample pointer, used for the stores
+        ld1     {v0.s}[0], [x0], x1     // rows 0-3 (4 samples each) -> low words of v0-v3
+        ld1     {v1.s}[0], [x0], x1
+        ld1     {v2.s}[0], [x0], x1
+        add     w2, w2, w2, lsl #4      // 17 * dc
+        ld1     {v3.s}[0], [x0], x1
+        add     w2, w2, #4              // 17 * dc + 4
+        asr     w2, w2, #3              // dc' = (17 * dc + 4) >> 3
+        add     w2, w2, w2, lsl #1      // 3 * dc'
+        ld1     {v0.s}[1], [x0], x1     // rows 4-7 -> high words of v0-v3
+        add     w2, w2, #16             // 3 * dc' + 16
+        asr     w2, w2, #5              // dc'' = (3 * dc' + 16) >> 5
+        dup     v4.8h, w2               // broadcast dc'' to all eight 16-bit lanes
+        ld1     {v1.s}[1], [x0], x1
+        ld1     {v2.s}[1], [x0], x1
+        ld1     {v3.s}[1], [x0]
+        uaddw   v0.8h, v4.8h, v0.8b     // dc'' + zero-extended samples, two rows per register
+        uaddw   v1.8h, v4.8h, v1.8b
+        uaddw   v2.8h, v4.8h, v2.8b
+        uaddw   v3.8h, v4.8h, v3.8b
+        sqxtun  v0.8b, v0.8h            // saturate the signed sums to unsigned 8 bits
+        sqxtun  v1.8b, v1.8h
+        sqxtun  v2.8b, v2.8h
+        sqxtun  v3.8b, v3.8h
+        st1     {v0.s}[0], [x3], x1     // rows 0-3 from the low words
+        st1     {v1.s}[0], [x3], x1
+        st1     {v2.s}[0], [x3], x1
+        st1     {v3.s}[0], [x3], x1
+        st1     {v0.s}[1], [x3], x1     // rows 4-7 from the high words
+        st1     {v1.s}[1], [x3], x1
+        st1     {v2.s}[1], [x3], x1
+        st1     {v3.s}[1], [x3]
+        ret
+endfunc
+
+// VC-1 4x4 inverse transform, DC case
+// On entry:
+//   x0 -> array of 8-bit samples, in row-major order
+//   x1 = row stride for 8-bit sample array
+//   x2 -> 16-bit inverse transform DC coefficient
+// On exit:
+//   array at x0 updated by saturated addition of (narrowed) transformed block
+function ff_vc1_inv_trans_4x4_dc_neon, export=1
+        ldrsh   w2, [x2]                // dc = sign-extended 16-bit coefficient at x2
+        mov     x3, x0                  // second copy of the sample pointer, used for the stores
+        ld1     {v0.s}[0], [x0], x1     // row 0 -> v0 low word
+        ld1     {v1.s}[0], [x0], x1     // row 1 -> v1 low word
+        ld1     {v0.s}[1], [x0], x1     // row 2 -> v0 high word
+        add     w2, w2, w2, lsl #4      // 17 * dc
+        ld1     {v1.s}[1], [x0]         // row 3 -> v1 high word; last use of x0 as a pointer
+        add     w0, w2, #4              // 17 * dc + 4 (w0 reused as scratch)
+        asr     w0, w0, #3              // dc' = (17 * dc + 4) >> 3
+        add     w0, w0, w0, lsl #4      // 17 * dc'
+        add     w0, w0, #64             // 17 * dc' + 64
+        asr     w0, w0, #7              // dc'' = (17 * dc' + 64) >> 7
+        dup     v2.8h, w0               // broadcast dc'' to all eight 16-bit lanes
+        uaddw   v0.8h, v2.8h, v0.8b     // dc'' + zero-extended samples (rows 0 and 2)
+        uaddw   v1.8h, v2.8h, v1.8b     // dc'' + zero-extended samples (rows 1 and 3)
+        sqxtun  v0.8b, v0.8h            // saturate the signed sums to unsigned 8 bits
+        sqxtun  v1.8b, v1.8h
+        st1     {v0.s}[0], [x3], x1     // row 0
+        st1     {v1.s}[0], [x3], x1     // row 1
+        st1     {v0.s}[1], [x3], x1     // row 2
+        st1     {v1.s}[1], [x3]         // row 3
+        ret
+endfunc
+
 .align  5
+.Lcoeffs_it8:
+.quad   0x000F00090003                  // 16-bit lanes, low to high: 3, 9, 15, 0
+.Lcoeffs_it4:
+.quad   0x0011000B0005                  // 16-bit lanes, low to high: 5, 11, 17, 0 (the 4x4 transform uses them as 10/2, 22/2, 17)
 .Lcoeffs:
 .quad   0x00050002
 
-- 
2.25.1



More information about the ffmpeg-devel mailing list