[FFmpeg-devel] Add ARM64 NEON optimization for HEVC decoder

章军海 243186085 at qq.com
Wed Jan 27 17:17:17 CET 2016


Add ARM64 NEON optimizations for the HEVC decoder; they improve decoding performance substantially on aarch64 hardware.


From c96995ea3bbfbbc42b7af7b447c8ada35f4b8a32 Mon Sep 17 00:00:00 2001
From: zjh8890 <zjh8890 at users.noreply.github.com>
Date: Mon, 18 Jan 2016 17:14:14 +0800
Subject: [PATCH 01/12] Create hevcdsp_idct_neon.S for aarch64


This file provides aarch64 NEON implementations of the HEVC inverse transforms (IDCT).
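
For reference, a plain-C sketch of the 4-point even/odd butterfly that the tr4/tr4_shift macros vectorize. The names below are placeholders of mine; only the coefficients (64, i.e. the sshll #6, plus 83 and 36 from trans_coeff) and the rounding shifts (7 for the first pass, 12 for the second at 8-bit depth) follow the HEVC inverse transform:

    /* Scalar model of one tr4 pass over a single column of four
     * coefficients.  sqrshrn in the assembly is the rounded, saturating
     * narrowing shift modelled by the add/clip below. */
    #include <stdint.h>

    static int16_t clip16(int v)
    {
        return v < -32768 ? -32768 : v > 32767 ? 32767 : (int16_t)v;
    }

    static void tr4_ref(const int16_t src[4], int16_t dst[4], int shift)
    {
        int add = 1 << (shift - 1);
        int e0  = 64 * src[0] + 64 * src[2];   /* even part: sshll #6   */
        int e1  = 64 * src[0] - 64 * src[2];
        int o0  = 83 * src[1] + 36 * src[3];   /* odd part: smull/smlal */
        int o1  = 36 * src[1] - 83 * src[3];

        dst[0] = clip16((e0 + o0 + add) >> shift);
        dst[1] = clip16((e1 + o1 + add) >> shift);
        dst[2] = clip16((e1 - o1 + add) >> shift);
        dst[3] = clip16((e0 - o0 + add) >> shift);
    }

The 4x4 function applies this column-wise, transposes with the zip1/zip2 pairs, and repeats row-wise; the 8/16/32-point paths build on the same idea through tr8_begin/tr16_begin/tr32_begin.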


Signed-off-by: zjh8890 <243186085 at qq.com>
---
 libavcodec/aarch64/hevcdsp_idct_neon.S | 1151 ++++++++++++++++++++++++++++++++
 1 file changed, 1151 insertions(+)
 create mode 100644 libavcodec/aarch64/hevcdsp_idct_neon.S


diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
new file mode 100644
index 0000000..2bc23c2
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -0,0 +1,1151 @@
+/*
+ * Copyright (c) 2015 Junhai ZHANG <243186085 at qq.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+.macro  transpose_16b_8x8   r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
+        trn1        \r8\().8H,  \r0\().8H,  \r1\().8H
+        trn2        \r9\().8H,  \r0\().8H,  \r1\().8H
+        trn1        \r1\().8H,  \r2\().8H,  \r3\().8H
+        trn2        \r3\().8H,  \r2\().8H,  \r3\().8H
+        trn1        \r0\().8H,  \r4\().8H,  \r5\().8H
+        trn2        \r5\().8H,  \r4\().8H,  \r5\().8H
+        trn1        \r2\().8H,  \r6\().8H,  \r7\().8H
+        trn2        \r7\().8H,  \r6\().8H,  \r7\().8H
+        trn1        \r4\().4S,  \r0\().4S,  \r2\().4S
+        trn2        \r2\().4S,  \r0\().4S,  \r2\().4S
+        trn1        \r6\().4S,  \r5\().4S,  \r7\().4S
+        trn2        \r7\().4S,  \r5\().4S,  \r7\().4S
+        trn1        \r5\().4S,  \r9\().4S,  \r3\().4S
+        trn2        \r9\().4S,  \r9\().4S,  \r3\().4S
+        trn1        \r3\().4S,  \r8\().4S,  \r1\().4S
+        trn2        \r8\().4S,  \r8\().4S,  \r1\().4S
+        trn1        \r0\().2D,  \r3\().2D,  \r4\().2D
+        trn2        \r4\().2D,  \r3\().2D,  \r4\().2D
+        trn1        \r1\().2D,  \r5\().2D,  \r6\().2D
+        trn2        \r5\().2D,  \r5\().2D,  \r6\().2D
+        trn2        \r6\().2D,  \r8\().2D,  \r2\().2D
+        trn1        \r2\().2D,  \r8\().2D,  \r2\().2D
+        trn1        \r3\().2D,  \r9\().2D,  \r7\().2D
+        trn2        \r7\().2D,  \r9\().2D,  \r7\().2D
+.endm
+
+.macro  transpose_16b_4x4  r0, r1, r2, r3, r4, r5, r6, r7
+        trn1        \r4\().4H,  \r0\().4H,  \r1\().4H
+        trn2        \r5\().4H,  \r0\().4H,  \r1\().4H
+        trn1        \r7\().4H,  \r2\().4H,  \r3\().4H
+        trn2        \r6\().4H,  \r2\().4H,  \r3\().4H
+        trn1        \r0\().2S,  \r4\().2S,  \r7\().2S
+        trn2        \r2\().2S,  \r4\().2S,  \r7\().2S
+        trn1        \r1\().2S,  \r5\().2S,  \r6\().2S
+        trn2        \r3\().2S,  \r5\().2S,  \r6\().2S
+.endm
+
+.macro tr4_luma_shift r0, r1, shift
+        saddl       v5.4S, \r0\().4H, \r1\().4H
+        trn2        v1.2D, \r1\().2D, \r1\().2D
+        saddl       v2.4S, \r1\().4H, v1.4H
+        ssubl       v4.4S, \r0\().4H, v1.4H
+        smull2      v6.4S, \r0\().8H, v0.H[0]
+        saddl       v7.4S, \r0\().4H, v1.4H
+        ssubw       v7.4S, v7.4S, \r1\().4H
+        mul         v7.4S, v7.4S, v0.S[0]
+        mul         v8.4S, v5.4S, v0.S[1]
+        mul         v9.4S, v2.4S, v0.S[2]
+        add         v8.4S, v8.4S, v9.4S
+        add         v8.4S, v8.4S, v6.4S
+        mul         v2.4S, v2.4S, v0.S[1]
+        mul         v9.4S, v4.4S, v0.S[2]
+        sub         v9.4S, v9.4S, v2.4S
+        add         v9.4S, v9.4S, v6.4S
+        mul         v5.4S, v5.4S, v0.S[2]
+        mul         v4.4S, v4.4S, v0.S[1]
+        add         v5.4S, v5.4S, v4.4S
+        sub         v5.4S, v5.4S, v6.4S
+        sqrshrn     \r0\().4H, v8.4S, \shift
+        sqrshrn     \r1\().4H, v9.4S, \shift
+        sqrshrn2    \r0\().8H, v7.4S, \shift
+        sqrshrn2    \r1\().8H, v5.4S, \shift
+.endm
+
+.macro tr4 r0, r1, r2, r3
+        smull       v4.4S, \r1\().4H, v0.H[0]
+        smull       v6.4S, \r1\().4H, v0.H[1]
+        sshll       v2.4S, \r0\().4H, #6
+        sshll       v3.4S, \r2\().4H, #6
+        add         v5.4S, v2.4S, v3.4S
+        sub         v2.4S, v2.4S, v3.4S
+        smlal       v4.4S, \r3\().4H, v0.H[1]
+        smlsl       v6.4S, \r3\().4H, v0.H[0]
+        sub         v3.4S, v5.4S, v4.4S
+        add         v4.4S, v5.4S, v4.4S
+        add         v5.4S, v2.4S, v6.4S
+        sub         v6.4S, v2.4S, v6.4S
+.endm
+
+.macro tr4_shift r0, r1, shift
+        smull2      v4.4S, \r0\().8H, v0.H[0]
+        smull2      v6.4S, \r0\().8H, v0.H[1]
+        sshll       v2.4S, \r0\().4H, #6
+        sshll       v3.4S, \r1\().4H, #6
+        add         v5.4S, v2.4S, v3.4S
+        sub         v2.4S, v2.4S, v3.4S
+        smlal2      v4.4S, \r1\().8H, v0.H[1]
+        smlsl2      v6.4S, \r1\().8H, v0.H[0]
+        sub         v3.4S, v5.4S, v4.4S
+        add         v4.4S, v5.4S, v4.4S
+        add         v5.4S, v2.4S, v6.4S
+        sub         v6.4S, v2.4S, v6.4S
+        sqrshrn     \r0\().4H, v4.4S, \shift
+        sqrshrn     \r1\().4H, v5.4S, \shift
+        sqrshrn2    \r0\().8H, v6.4S, \shift
+        sqrshrn2    \r1\().8H, v3.4S, \shift
+.endm
+
+.macro tr8_begin in0, in1, in2, in3
+        smull       v7.4S, \in0\().4H, v0.H[5]
+        smull       v8.4S, \in0\().4H, v0.H[4]
+        smull       v9.4S, \in0\().4H, v0.H[7]
+        smull       v10.4S, \in0\().4H, v0.H[6]
+        smlal       v7.4S, \in1\().4H, v0.H[4]
+        smlsl       v8.4S, \in1\().4H, v0.H[6]
+        smlsl       v9.4S, \in1\().4H, v0.H[5]
+        smlsl       v10.4S, \in1\().4H, v0.H[7]
+        smlal       v7.4S, \in2\().4H, v0.H[7]
+        smlsl       v8.4S, \in2\().4H, v0.H[5]
+        smlal       v9.4S, \in2\().4H, v0.H[6]
+        smlal       v10.4S, \in2\().4H, v0.H[4]
+        smlal       v7.4S, \in3\().4H, v0.H[6]
+        smlsl       v8.4S, \in3\().4H, v0.H[7]
+        smlal       v9.4S, \in3\().4H, v0.H[4]
+        smlsl       v10.4S, \in3\().4H, v0.H[5]
+.endm
+
+.macro tr8_end shift
+        add         v1.4S, v4.4S, v7.4S
+        sub         v4.4S, v4.4S, v7.4S
+        add         v2.4S, v5.4S, v8.4S
+        sub         v5.4S, v5.4S, v8.4S
+        add         v11.4S, v6.4S, v9.4S 
+        sub         v6.4S, v6.4S, v9.4S
+        add         v12.4S, v3.4S, v10.4S
+        sub         v3.4S, v3.4S, v10.4S
+        sqrshrn     v9.4H, v4.4S, \shift
+        sqrshrn     v8.4H, v5.4S, \shift
+        sqrshrn     v7.4H, v6.4S, \shift
+        sqrshrn     v6.4H, v3.4S, \shift
+        sqrshrn     v5.4H, v12.4S, \shift
+        sqrshrn     v4.4H, v11.4S, \shift
+        sqrshrn     v3.4H, v2.4S, \shift
+        sqrshrn     v2.4H, v1.4S, \shift
+.endm
+
+.macro tr8_end_0
+        sub         v15.4S, v4.4S, v7.4S
+        sub         v14.4S, v5.4S, v8.4S
+        sub         v13.4S, v6.4S, v9.4S
+        sub         v12.4S, v3.4S, v10.4S
+        add         v11.4S, v3.4S, v10.4S
+        add         v10.4S, v6.4S, v9.4S
+        add         v9.4S,  v5.4S, v8.4S
+        add         v8.4S,  v4.4S, v7.4S
+.endm
+
+.macro tr16_begin in0, in1, in2, in3, in4, in5, in6, in7
+        smull       v2.4S,  \in0\().4H, v1.H[1]
+        smull       v3.4S,  \in0\().4H, v1.H[0]
+        smull       v4.4S,  \in0\().4H, v1.H[3]
+        smull       v5.4S,  \in0\().4H, v1.H[2]
+        smull       v6.4S,  \in0\().4H, v1.H[5]
+        smull       v7.4S,  \in0\().4H, v1.H[4]
+        smull       v8.4S,  \in0\().4H, v1.H[7]
+        smull       v9.4S,  \in0\().4H, v1.H[6]
+        smlal       v2.4S,  \in1\().4H, v1.H[0]
+        smlal       v3.4S,  \in1\().4H, v1.H[5]
+        smlal       v4.4S,  \in1\().4H, v1.H[6]
+        smlsl       v5.4S,  \in1\().4H, v1.H[4]
+        smlsl       v6.4S,  \in1\().4H, v1.H[3]
+        smlsl       v7.4S,  \in1\().4H, v1.H[1]
+        smlsl       v8.4S,  \in1\().4H, v1.H[2]
+        smlsl       v9.4S,  \in1\().4H, v1.H[7]
+        smlal       v2.4S,  \in2\().4H, v1.H[3]
+        smlal       v3.4S,  \in2\().4H, v1.H[6]
+        smlsl       v4.4S,  \in2\().4H, v1.H[2]
+        smlsl       v5.4S,  \in2\().4H, v1.H[0]
+        smlsl       v6.4S,  \in2\().4H, v1.H[7]
+        smlal       v7.4S,  \in2\().4H, v1.H[5]
+        smlal       v8.4S,  \in2\().4H, v1.H[1]
+        smlal       v9.4S,  \in2\().4H, v1.H[4]
+        smlal       v2.4S,  \in3\().4H, v1.H[2]
+        smlsl       v3.4S,  \in3\().4H, v1.H[4]
+        smlsl       v4.4S,  \in3\().4H, v1.H[0]
+        smlal       v5.4S,  \in3\().4H, v1.H[6]
+        smlal       v6.4S,  \in3\().4H, v1.H[1]
+        smlal       v7.4S,  \in3\().4H, v1.H[7]
+        smlsl       v8.4S,  \in3\().4H, v1.H[3]
+        smlsl       v9.4S,  \in3\().4H, v1.H[5]
+        smlal       v2.4S,  \in4\().4H, v1.H[5]
+        smlsl       v3.4S,  \in4\().4H, v1.H[3]
+        smlsl       v4.4S,  \in4\().4H, v1.H[7]
+        smlal       v5.4S,  \in4\().4H, v1.H[1]
+        smlsl       v6.4S,  \in4\().4H, v1.H[6]
+        smlsl       v7.4S,  \in4\().4H, v1.H[0]
+        smlal       v8.4S,  \in4\().4H, v1.H[4]
+        smlal       v9.4S,  \in4\().4H, v1.H[2]
+        smlal       v2.4S,  \in5\().4H, v1.H[4]
+        smlsl       v3.4S,  \in5\().4H, v1.H[1]
+        smlal       v4.4S,  \in5\().4H, v1.H[5]
+        smlal       v5.4S,  \in5\().4H, v1.H[7]
+        smlsl       v6.4S,  \in5\().4H, v1.H[0]
+        smlal       v7.4S,  \in5\().4H, v1.H[2]
+        smlal       v8.4S,  \in5\().4H, v1.H[6]
+        smlsl       v9.4S,  \in5\().4H, v1.H[3]
+        smlal       v2.4S,  \in6\().4H, v1.H[7]
+        smlsl       v3.4S,  \in6\().4H, v1.H[2]
+        smlal       v4.4S,  \in6\().4H, v1.H[1]
+        smlsl       v5.4S,  \in6\().4H, v1.H[3]
+        smlal       v6.4S,  \in6\().4H, v1.H[4]
+        smlal       v7.4S,  \in6\().4H, v1.H[6]
+        smlsl       v8.4S,  \in6\().4H, v1.H[5]
+        smlal       v9.4S,  \in6\().4H, v1.H[0]
+        smlal       v2.4S,  \in7\().4H, v1.H[6]
+        smlsl       v3.4S,  \in7\().4H, v1.H[7]
+        smlal       v4.4S,  \in7\().4H, v1.H[4]
+        smlsl       v5.4S,  \in7\().4H, v1.H[5]
+        smlal       v6.4S,  \in7\().4H, v1.H[2]
+        smlsl       v7.4S,  \in7\().4H, v1.H[3]
+        smlal       v8.4S,  \in7\().4H, v1.H[0]
+        smlsl       v9.4S,  \in7\().4H, v1.H[1]
+.endm
+
+.macro  tr32_begin
+        smull       v6.4S,  v16.4H, v0.H[0]
+        smull       v7.4S,  v16.4H, v0.H[0]
+        smull       v8.4S,  v16.4H, v0.H[2]
+        smull       v9.4S,  v16.4H, v0.H[3]
+        smlal       v6.4S,  v17.4H, v0.H[0]
+        smlal       v7.4S,  v17.4H, v0.H[4]
+        smlal       v8.4S,  v17.4H, v0.H[7]
+        smlal       v9.4S,  v17.4H, v1.H[2]
+        smlal       v6.4S,  v18.4H, v0.H[2]
+        smlal       v7.4S,  v18.4H, v0.H[7]
+        smlal       v8.4S,  v18.4H, v1.H[4]
+        smlsl       v9.4S,  v18.4H, v1.H[6]
+        smlal       v6.4S,  v19.4H, v0.H[3]
+        smlal       v7.4S,  v19.4H, v1.H[2]
+        smlsl       v8.4S,  v19.4H, v1.H[6]
+        smlsl       v9.4S,  v19.4H, v0.H[7]
+        smlal       v6.4S,  v20.4H, v0.H[4]
+        smlal       v7.4S,  v20.4H, v1.H[5]
+        smlsl       v8.4S,  v20.4H, v1.H[1]
+        smlsl       v9.4S,  v20.4H, v0.H[0]
+        smlal       v6.4S,  v21.4H, v0.H[5]
+        smlsl       v7.4S,  v21.4H, v1.H[7]
+        smlsl       v8.4S,  v21.4H, v0.H[4]
+        smlsl       v9.4S,  v21.4H, v0.H[6]
+        smlal       v6.4S,  v22.4H, v0.H[6]
+        smlsl       v7.4S,  v22.4H, v1.H[4]
+        smlsl       v8.4S,  v22.4H, v0.H[0]
+        smlsl       v9.4S,  v22.4H, v1.H[5]
+        smlal       v6.4S,  v23.4H, v0.H[7]
+        smlsl       v7.4S,  v23.4H, v1.H[1]
+        smlsl       v8.4S,  v23.4H, v0.H[5]
+        smlal       v9.4S,  v23.4H, v1.H[3]
+        smlal       v6.4S,  v24.4H, v1.H[0]
+        smlsl       v7.4S,  v24.4H, v0.H[6]
+        smlsl       v8.4S,  v24.4H, v1.H[2]
+        smlal       v9.4S,  v24.4H, v0.H[4]
+        smlal       v6.4S,  v25.4H, v1.H[1]
+        smlsl       v7.4S,  v25.4H, v0.H[3]
+        smlsl       v8.4S,  v25.4H, v1.H[7]
+        smlal       v9.4S,  v25.4H, v0.H[2]
+        smlal       v6.4S,  v26.4H, v1.H[2]
+        smlsl       v7.4S,  v26.4H, v0.H[0]
+        smlal       v8.4S,  v26.4H, v1.H[3]
+        smlal       v9.4S,  v26.4H, v1.H[1]
+        smlal       v6.4S,  v27.4H, v1.H[3]
+        smlsl       v7.4S,  v27.4H, v0.H[2]
+        smlal       v8.4S,  v27.4H, v0.H[6]
+        smlsl       v9.4S,  v27.4H, v1.H[7]
+        smlal       v6.4S,  v28.4H, v1.H[4]
+        smlsl       v7.4S,  v28.4H, v0.H[5]
+        smlal       v8.4S,  v28.4H, v0.H[0]
+        smlsl       v9.4S,  v28.4H, v1.H[0]
+        smlal       v6.4S,  v29.4H, v1.H[5]
+        smlsl       v7.4S,  v29.4H, v1.H[0]
+        smlal       v8.4S,  v29.4H, v0.H[3]
+        smlsl       v9.4S,  v29.4H, v0.H[0]
+        smlal       v6.4S,  v30.4H, v1.H[6]
+        smlsl       v7.4S,  v30.4H, v1.H[3]
+        smlal       v8.4S,  v30.4H, v1.H[0]
+        smlsl       v9.4S,  v30.4H, v0.H[5]
+        smlal       v6.4S,  v31.4H, v1.H[7]
+        smlsl       v7.4S,  v31.4H, v1.H[6]
+        smlal       v8.4S,  v31.4H, v1.H[5]
+        smlsl       v9.4S,  v31.4H, v1.H[0]
+        smull       v2.4S,  v16.4H, v0.H[4]
+        smull       v3.4S,  v16.4H, v0.H[5]
+        smull       v4.4S,  v16.4H, v0.H[6]
+        smull       v5.4S,  v16.4H, v0.H[7]
+        smlal       v2.4S,  v17.4H, v1.H[5]
+        smlsl       v3.4S,  v17.4H, v1.H[7]
+        smlsl       v4.4S,  v17.4H, v1.H[4]
+        smlsl       v5.4S,  v17.4H, v1.H[1]
+        smlsl       v2.4S,  v18.4H, v1.H[1]
+        smlsl       v3.4S,  v18.4H, v0.H[4]
+        smlsl       v4.4S,  v18.4H, v0.H[0]
+        smlsl       v5.4S,  v18.4H, v0.H[5]
+        smlsl       v2.4S,  v19.4H, v0.H[0]
+        smlsl       v3.4S,  v19.4H, v0.H[6]
+        smlsl       v4.4S,  v19.4H, v1.H[5]
+        smlal       v5.4S,  v19.4H, v1.H[3]
+        smlsl       v2.4S,  v20.4H, v1.H[0]
+        smlal       v3.4S,  v20.4H, v1.H[6]
+        smlal       v4.4S,  v20.4H, v0.H[5]
+        smlal       v5.4S,  v20.4H, v0.H[3]
+        smlal       v2.4S,  v21.4H, v1.H[6]
+        smlal       v3.4S,  v21.4H, v0.H[3]
+        smlal       v4.4S,  v21.4H, v0.H[7]
+        smlsl       v5.4S,  v21.4H, v1.H[5]
+        smlal       v2.4S,  v22.4H, v0.H[5]
+        smlal       v3.4S,  v22.4H, v0.H[7]
+        smlsl       v4.4S,  v22.4H, v1.H[3]
+        smlsl       v5.4S,  v22.4H, v0.H[0]
+        smlal       v2.4S,  v23.4H, v0.H[3]
+        smlsl       v3.4S,  v23.4H, v1.H[5]
+        smlsl       v4.4S,  v23.4H, v0.H[0]
+        smlal       v5.4S,  v23.4H, v1.H[7]
+        smlal       v2.4S,  v24.4H, v1.H[4]
+        smlsl       v3.4S,  v24.4H, v0.H[2]
+        smlsl       v4.4S,  v24.4H, v1.H[6]
+        smlal       v5.4S,  v24.4H, v0.H[0]
+        smlsl       v2.4S,  v25.4H, v1.H[2]
+        smlsl       v3.4S,  v25.4H, v1.H[0]
+        smlal       v4.4S,  v25.4H, v0.H[4]
+        smlal       v5.4S,  v25.4H, v1.H[6]
+        smlsl       v2.4S,  v26.4H, v0.H[0] 
+        smlal       v3.4S,  v26.4H, v1.H[4]
+        smlal       v4.4S,  v26.4H, v1.H[0]
+        smlsl       v5.4S,  v26.4H, v0.H[2]
+        smlsl       v2.4S,  v27.4H, v0.H[7]
+        smlal       v3.4S,  v27.4H, v0.H[0]
+        smlsl       v4.4S,  v27.4H, v1.H[2]
+        smlsl       v5.4S,  v27.4H, v1.H[4]
+        smlal       v2.4S,  v28.4H, v1.H[7]
+        smlal       v3.4S,  v28.4H, v1.H[1]
+        smlsl       v4.4S,  v28.4H, v0.H[2]
+        smlal       v5.4S,  v28.4H, v0.H[4]
+        smlal       v2.4S,  v29.4H, v0.H[6]
+        smlsl       v3.4S,  v29.4H, v1.H[3]
+        smlsl       v4.4S,  v29.4H, v1.H[7]
+        smlal       v5.4S,  v29.4H, v1.H[2]
+        smlal       v2.4S,  v30.4H, v0.H[2]
+        smlsl       v3.4S,  v30.4H, v0.H[0]
+        smlal       v4.4S,  v30.4H, v0.H[3]
+        smlsl       v5.4S,  v30.4H, v0.H[6]
+        smlal       v2.4S,  v31.4H, v1.H[3]
+        smlsl       v3.4S,  v31.4H, v1.H[2]
+        smlal       v4.4S,  v31.4H, v1.H[1]
+        smlsl       v5.4S,  v31.4H, v1.H[0]
+        st1         {v6.2D - v9.2D}, [x6], #64
+        st1         {v2.2D - v5.2D}, [x6], #64
+
+        smull       v6.4S,  v16.4H, v1.H[0]
+        smull       v7.4S,  v16.4H, v1.H[1]
+        smull       v8.4S,  v16.4H, v1.H[2]
+        smull       v9.4S,  v16.4H, v1.H[3]
+        smlsl       v6.4S,  v17.4H, v0.H[6]
+        smlsl       v7.4S,  v17.4H, v0.H[3]
+        smlsl       v8.4S,  v17.4H, v0.H[0]
+        smlsl       v9.4S,  v17.4H, v0.H[2]
+        smlsl       v6.4S,  v18.4H, v1.H[2]
+        smlsl       v7.4S,  v18.4H, v1.H[7]
+        smlal       v8.4S,  v18.4H, v1.H[3]
+        smlal       v9.4S,  v18.4H, v0.H[6]
+        smlal       v6.4S,  v19.4H, v0.H[4]
+        smlal       v7.4S,  v19.4H, v0.H[2]
+        smlal       v8.4S,  v19.4H, v1.H[1]
+        smlsl       v9.4S,  v19.4H, v1.H[7]
+        smlal       v6.4S,  v20.4H, v1.H[4]
+        smlsl       v7.4S,  v20.4H, v1.H[2]
+        smlsl       v8.4S,  v20.4H, v0.H[0]
+        smlsl       v9.4S,  v20.4H, v0.H[7]
+        smlsl       v6.4S,  v21.4H, v0.H[2]
+        smlsl       v7.4S,  v21.4H, v1.H[0]
+        smlal       v8.4S,  v21.4H, v1.H[4]
+        smlal       v9.4S,  v21.4H, v0.H[0]
+        smlsl       v6.4S,  v22.4H, v1.H[6]
+        smlal       v7.4S,  v22.4H, v0.H[4]
+        smlal       v8.4S,  v22.4H, v1.H[0]
+        smlsl       v9.4S,  v22.4H, v1.H[2]
+        smlal       v6.4S,  v23.4H, v0.H[0]
+        smlal       v7.4S,  v23.4H, v1.H[6]
+        smlsl       v8.4S,  v23.4H, v0.H[2]
+        smlal       v9.4S,  v23.4H, v1.H[4]
+        smlsl       v6.4S,  v24.4H, v1.H[7]
+        smlsl       v7.4S,  v24.4H, v0.H[0]
+        smlal       v8.4S,  v24.4H, v1.H[5]
+        smlal       v9.4S,  v24.4H, v0.H[3]
+        smlsl       v6.4S,  v25.4H, v0.H[0]
+        smlal       v7.4S,  v25.4H, v1.H[3]
+        smlal       v8.4S,  v25.4H, v0.H[7]
+        smlsl       v9.4S,  v25.4H, v0.H[5]
+        smlal       v6.4S,  v26.4H, v1.H[5]
+        smlal       v7.4S,  v26.4H, v0.H[7]
+        smlsl       v8.4S,  v26.4H, v0.H[3]
+        smlal       v9.4S,  v26.4H, v1.H[6]
+        smlal       v6.4S,  v27.4H, v0.H[3]
+        smlsl       v7.4S,  v27.4H, v0.H[5]
+        smlal       v8.4S,  v27.4H, v1.H[6]
+        smlal       v9.4S,  v27.4H, v1.H[0]
+        smlsl       v6.4S,  v28.4H, v1.H[3]
+        smlsl       v7.4S,  v28.4H, v1.H[5]
+        smlal       v8.4S,  v28.4H, v0.H[6]
+        smlsl       v9.4S,  v28.4H, v0.H[0]
+        smlsl       v6.4S,  v29.4H, v0.H[5]
+        smlal       v7.4S,  v29.4H, v0.H[0]
+        smlsl       v8.4S,  v29.4H, v0.H[4]
+        smlal       v9.4S,  v29.4H, v1.H[1]
+        smlal       v6.4S,  v30.4H, v1.H[1]
+        smlsl       v7.4S,  v30.4H, v1.H[4]
+        smlal       v8.4S,  v30.4H, v1.H[7]
+        smlal       v9.4S,  v30.4H, v1.H[5]
+        smlal       v6.4S,  v31.4H, v0.H[7]
+        smlsl       v7.4S,  v31.4H, v0.H[6]
+        smlal       v8.4S,  v31.4H, v0.H[5]
+        smlsl       v9.4S,  v31.4H, v0.H[4] 
+        smull       v2.4S,  v16.4H, v1.H[4]
+        smull       v3.4S,  v16.4H, v1.H[5]
+        smull       v4.4S,  v16.4H, v1.H[6]
+        smull       v5.4S,  v16.4H, v1.H[7]
+        smlsl       v2.4S,  v17.4H, v0.H[5]
+        smlsl       v3.4S,  v17.4H, v1.H[0]
+        smlsl       v4.4S,  v17.4H, v1.H[3]
+        smlsl       v5.4S,  v17.4H, v1.H[6]
+        smlal       v2.4S,  v18.4H, v0.H[0]
+        smlal       v3.4S,  v18.4H, v0.H[3]
+        smlal       v4.4S,  v18.4H, v1.H[0]
+        smlal       v5.4S,  v18.4H, v1.H[5]
+        smlsl       v2.4S,  v19.4H, v1.H[0]
+        smlsl       v3.4S,  v19.4H, v0.H[0]
+        smlsl       v4.4S,  v19.4H, v0.H[5]
+        smlsl       v5.4S,  v19.4H, v1.H[4]
+        smlal       v2.4S,  v20.4H, v1.H[7]
+        smlal       v3.4S,  v20.4H, v0.H[6]
+        smlal       v4.4S,  v20.4H, v0.H[2]
+        smlal       v5.4S,  v20.4H, v1.H[3]
+        smlal       v2.4S,  v21.4H, v1.H[1]
+        smlsl       v3.4S,  v21.4H, v1.H[3]
+        smlsl       v4.4S,  v21.4H, v0.H[0]
+        smlsl       v5.4S,  v21.4H, v1.H[2]
+        smlsl       v2.4S,  v22.4H, v0.H[2]
+        smlsl       v3.4S,  v22.4H, v1.H[7]
+        smlal       v4.4S,  v22.4H, v0.H[3]
+        smlal       v5.4S,  v22.4H, v1.H[1]
+        smlal       v2.4S,  v23.4H, v0.H[4]
+        smlal       v3.4S,  v23.4H, v1.H[2]
+        smlsl       v4.4S,  v23.4H, v0.H[6]
+        smlsl       v5.4S,  v23.4H, v1.H[0]
+        smlsl       v2.4S,  v24.4H, v1.H[3]
+        smlsl       v3.4S,  v24.4H, v0.H[5]
+        smlal       v4.4S,  v24.4H, v1.H[1]
+        smlal       v5.4S,  v24.4H, v0.H[7]
+        smlsl       v2.4S,  v25.4H, v1.H[5]
+        smlal       v3.4S,  v25.4H, v0.H[0]
+        smlsl       v4.4S,  v25.4H, v1.H[4]
+        smlsl       v5.4S,  v25.4H, v0.H[6]
+        smlal       v2.4S,  v26.4H, v0.H[6]
+        smlsl       v3.4S,  v26.4H, v0.H[4]
+        smlal       v4.4S,  v26.4H, v1.H[7]
+        smlal       v5.4S,  v26.4H, v0.H[5]
+        smlsl       v2.4S,  v27.4H, v0.H[0]
+        smlal       v3.4S,  v27.4H, v1.H[1]
+        smlal       v4.4S,  v27.4H, v1.H[5]
+        smlsl       v5.4S,  v27.4H, v0.H[4]
+        smlal       v2.4S,  v28.4H, v0.H[7]
+        smlsl       v3.4S,  v28.4H, v1.H[6]
+        smlsl       v4.4S,  v28.4H, v1.H[2]
+        smlal       v5.4S,  v28.4H, v0.H[3]
+        smlsl       v2.4S,  v29.4H, v1.H[6]
+        smlsl       v3.4S,  v29.4H, v1.H[4]
+        smlal       v4.4S,  v29.4H, v0.H[7]
+        smlsl       v5.4S,  v29.4H, v0.H[2]
+        smlsl       v2.4S,  v30.4H, v1.H[2]
+        smlal       v3.4S,  v30.4H, v0.H[7]
+        smlsl       v4.4S,  v30.4H, v0.H[4]
+        smlal       v5.4S,  v30.4H, v0.H[0]
+        smlal       v2.4S,  v31.4H, v0.H[3]
+        smlsl       v3.4S,  v31.4H, v0.H[2]
+        smlal       v4.4S,  v31.4H, v0.H[0]
+        smlsl       v5.4S,  v31.4H, v0.H[0]
+        st1         {v6.2D - v9.2D}, [x6], #64
+        st1         {v2.2D - v5.2D}, [x6], #64
+.endm
+
+function ff_hevc_transform_8x8_neon_8, export=1
+        movrel      x3, trans_coeff
+        mov         x5, #16
+        mov         x6, x0
+        ld1         {v0.2D}, [x3]
+        ld1         {v24.1D}, [x0], x5
+        ld1         {v25.1D}, [x0], x5
+        ld1         {v26.1D}, [x0], x5
+        ld1         {v27.1D}, [x0], x5
+        ld1         {v28.1D}, [x0], x5
+        ld1         {v29.1D}, [x0], x5
+        ld1         {v30.1D}, [x0], x5
+        ld1         {v31.1D}, [x0], x5
+        mov         x0, x6
+        tr8_begin   v25, v27, v29, v31
+        tr4         v24, v26, v28, v30
+        tr8_end     #7
+        st1         {v2.1D}, [x0], x5
+        st1         {v3.1D}, [x0], x5
+        st1         {v4.1D}, [x0], x5
+        st1         {v5.1D}, [x0], x5
+        st1         {v6.1D}, [x0], x5
+        st1         {v7.1D}, [x0], x5
+        st1         {v8.1D}, [x0], x5
+        st1         {v9.1D}, [x0], x5
+        mov         x0, x6
+        cmp         x1, #4
+        b.lt        1f
+        add         x0, x0, #8
+        ld1         {v24.1D}, [x0], x5
+        ld1         {v25.1D}, [x0], x5
+        ld1         {v26.1D}, [x0], x5
+        ld1         {v27.1D}, [x0], x5
+        ld1         {v28.1D}, [x0], x5
+        ld1         {v29.1D}, [x0], x5
+        ld1         {v30.1D}, [x0], x5
+        ld1         {v31.1D}, [x0], x5
+        sub         x0, x0, #128
+        tr8_begin   v25, v27, v29, v31
+        tr4         v24, v26, v28, v30
+        tr8_end     #7
+        st1         {v2.1D}, [x0], x5
+        st1         {v3.1D}, [x0], x5
+        st1         {v4.1D}, [x0], x5
+        st1         {v5.1D}, [x0], x5
+        st1         {v6.1D}, [x0], x5
+        st1         {v7.1D}, [x0], x5
+        st1         {v8.1D}, [x0], x5
+        st1         {v9.1D}, [x0], x5
+        mov         x0, x6
+1:      ld1         {v24.1D - v27.1D}, [x0], #32
+        ld1         {v28.1D - v31.1D}, [x0]
+        mov         x0, x6
+        transpose_16b_4x4 v24, v26, v28, v30, v11, v12, v13, v14
+        transpose_16b_4x4 v25, v27, v29, v31, v11, v12, v13, v14
+        tr8_begin   v26, v30, v27, v31
+        tr4         v24, v28, v25, v29
+        tr8_end     #12
+        transpose_16b_4x4 v2, v3, v4, v5, v11, v12, v13, v14
+        transpose_16b_4x4 v6, v7, v8, v9, v11, v12, v13, v14
+        zip1        v11.2D, v2.2D, v6.2D
+        zip1        v12.2D, v3.2D, v7.2D
+        zip1        v13.2D, v4.2D, v8.2D
+        zip1        v14.2D, v5.2D, v9.2D
+        st1         {v11.2D - v14.2D}, [x0], #64
+        ld1         {v24.4H - v27.4H}, [x0], #32
+        ld1         {v28.4H - v31.4H}, [x0]
+        sub         x0, x0, #32
+        transpose_16b_4x4 v24, v26, v28, v30, v11, v12, v13, v14
+        transpose_16b_4x4 v25, v27, v29, v31, v11, v12, v13, v14
+        tr8_begin   v26, v30, v27, v31
+        tr4         v24, v28, v25, v29
+        tr8_end     #12 
+        transpose_16b_4x4 v2, v3, v4, v5, v11, v12, v13, v14
+        transpose_16b_4x4 v6, v7, v8, v9, v11, v12, v13, v14
+        zip1        v11.2D, v2.2D, v6.2D
+        zip1        v12.2D, v3.2D, v7.2D
+        zip1        v13.2D, v4.2D, v8.2D
+        zip1        v14.2D, v5.2D, v9.2D
+        st1         {v11.2D - v12.2D}, [x0], #32
+        st1         {v13.2D - v14.2D}, [x0], #32
+        sub         x0, x0, #64
+        ret
+endfunc
+
+function ff_hevc_transform_16x16_neon_8, export=1
+        mov         x5, #32
+        lsr         x6, x5, #1
+        add         x7, x1, #4
+        cmp         x1, #12
+        csel        x1, x6, x7, gt
+        movrel      x3, trans_coeff
+        mov         x10, sp
+        mov         x4, XZR
+0:      ld1         {v0.2D - v1.2D}, [x3]
+        add         x0, x0, x5
+        lsl         x5, x5, #1
+        ld1         {v24.1D}, [x0], x5
+        ld1         {v25.1D}, [x0], x5
+        ld1         {v26.1D}, [x0], x5
+        ld1         {v27.1D}, [x0], x5
+        ld1         {v28.1D}, [x0], x5
+        ld1         {v29.1D}, [x0], x5
+        ld1         {v30.1D}, [x0], x5
+        ld1         {v31.1D}, [x0], x5
+        sub         x0, x0, x5, lsl #3
+        sub         x0, x0, x5, lsr #1
+        tr16_begin  v24, v25, v26, v27, v28, v29, v30, v31
+        sub         x10, x10, #128
+        st1         {v2.2D - v5.2D}, [x10], #64
+        st1         {v6.2D - v9.2D}, [x10], #64
+        ld1         {v24.1D}, [x0], x5
+        ld1         {v25.1D}, [x0], x5
+        ld1         {v26.1D}, [x0], x5
+        ld1         {v27.1D}, [x0], x5
+        ld1         {v28.1D}, [x0], x5
+        ld1         {v29.1D}, [x0], x5
+        ld1         {v30.1D}, [x0], x5
+        ld1         {v31.1D}, [x0], x5
+        sub         x0, x0, x5, lsl #3
+        lsr         x5, x5, #1
+        tr8_begin   v25, v27, v29, v31
+        tr4         v24, v26, v28, v30
+        tr8_end_0 
+        sub         x10, x10, #128
+        ld1         {v0.2D - v3.2D}, [x10], #64
+        add         v4.4S, v8.4S, v0.4S
+        sub         v8.4S, v8.4S, v0.4S
+        sqrshrn     v5.4H, v4.4S, #7
+        st1         {v5.1D}, [x0], x5
+        add         v4.4S, v9.4S, v1.4S
+        sub         v9.4S, v9.4S, v1.4S
+        sqrshrn     v5.4H, v4.4S, #7
+        st1         {v5.1D}, [x0], x5
+        add         v4.4S, v10.4S, v2.4S
+        sub         v10.4S, v10.4S, v2.4S
+        sqrshrn     v5.4H, v4.4S, #7
+        st1         {v5.1D}, [x0], x5
+        add         v4.4S, v11.4S, v3.4S
+        sub         v11.4S, v11.4S, v3.4S
+        sqrshrn     v5.4H, v4.4S, #7
+        st1         {v5.1D}, [x0], x5
+        ld1         {v0.2D - v3.2D}, [x10], #64
+        add         v4.4S, v12.4S, v0.4S
+        sub         v12.4S, v12.4S, v0.4S
+        sqrshrn     v5.4H, v4.4S, #7
+        st1         {v5.1D}, [x0], x5
+        add         v4.4S, v13.4S, v1.4S
+        sub         v13.4S, v13.4S, v1.4S
+        sqrshrn     v5.4H, v4.4S, #7
+        st1         {v5.1D}, [x0], x5
+        add         v4.4S, v14.4S, v2.4S
+        sub         v14.4S, v14.4S, v2.4S
+        sqrshrn     v5.4H, v4.4S, #7
+        st1         {v5.1D}, [x0], x5
+        add         v4.4S, v15.4S, v3.4S
+        sub         v15.4S, v15.4S, v3.4S
+        sqrshrn     v5.4H, v4.4S, #7
+        st1         {v5.1D}, [x0], x5
+        sqrshrn     v5.4H, v15.4S, #7
+        st1         {v5.1D}, [x0], x5
+        sqrshrn     v5.4H, v14.4S, #7
+        st1         {v5.1D}, [x0], x5
+        sqrshrn     v5.4H, v13.4S, #7
+        st1         {v5.1D}, [x0], x5
+        sqrshrn     v5.4H, v12.4S, #7
+        st1         {v5.1D}, [x0], x5
+        sqrshrn     v5.4H, v11.4S, #7
+        st1         {v5.1D}, [x0], x5
+        sqrshrn     v5.4H, v10.4S, #7
+        st1         {v5.1D}, [x0], x5
+        sqrshrn     v5.4H, v9.4S, #7
+        st1         {v5.1D}, [x0], x5
+        sqrshrn     v5.4H, v8.4S, #7
+        st1         {v5.1D}, [x0], x5
+        sub         x0, x0, x5, lsl #4
+        add         x0, x0, #8
+        add         x4, x4, #4
+        cmp         x4, x1
+        b.lt        0b
+        sub         x0, x0, x4, lsl #1
+        mov         x4, #4
+1:      ld1         {v0.2D - v1.2D}, [x3]
+        ld1         {v16.1D - v19.1D}, [x0], #32
+        ld1         {v20.1D - v23.1D}, [x0], #32
+        ld1         {v24.1D - v27.1D}, [x0], #32
+        ld1         {v28.1D - v31.1D}, [x0], #32
+        sub         x0, x0, #128
+        transpose_16b_4x4   v16, v20, v24, v28, v11, v12, v13, v14
+        transpose_16b_4x4   v17, v21, v25, v29, v11, v12, v13, v14
+        transpose_16b_4x4   v18, v22, v26, v30, v11, v12, v13, v14
+        transpose_16b_4x4   v19, v23, v27, v31, v11, v12, v13, v14
+        tr16_begin  v20, v28, v21, v29, v22, v30, v23, v31
+        sub         x10, x10, #128
+        st1         {v2.2D - v5.2D}, [x10], #64
+        st1         {v6.2D - v9.2D}, [x10], #64
+        tr8_begin   v24, v25, v26, v27
+        tr4         v16, v17, v18, v19
+        tr8_end_0
+        sub         x10, x10, #128
+        ld1         {v0.2D - v3.2D}, [x10], #64
+        add         v4.4S, v8.4S, v0.4S
+        sub         v18.4S, v8.4S, v0.4S
+        sqrshrn     v0.4H, v4.4S, #12
+        add         v4.4S, v9.4S, v1.4S
+        sub         v19.4S, v9.4S, v1.4S
+        sqrshrn     v1.4H, v4.4S, #12
+        add         v4.4S, v10.4S, v2.4S
+        sub         v20.4S, v10.4S, v2.4S
+        sqrshrn     v2.4H, v4.4S, #12
+        add         v4.4S, v11.4S, v3.4S
+        sub         v21.4S, v11.4S, v3.4S
+        sqrshrn     v3.4H, v4.4S, #12
+        ld1         {v4.2D - v7.2D}, [x10], #64
+        add         v10.4S, v12.4S, v4.4S
+        sub         v22.4S, v12.4S, v4.4S
+        sqrshrn     v4.4H, v10.4S, #12
+        add         v10.4S, v13.4S, v5.4S
+        sub         v23.4S, v13.4S, v5.4S
+        sqrshrn     v5.4H, v10.4S, #12
+        add         v10.4S, v14.4S, v6.4S
+        sub         v24.4S, v14.4S, v6.4S
+        sqrshrn     v6.4H, v10.4S, #12
+        add         v10.4S, v15.4S, v7.4S 
+        sub         v25.4S, v15.4S, v7.4S
+        sqrshrn     v7.4H, v10.4S, #12
+        sqrshrn     v8.4H, v25.4S, #12
+        sqrshrn     v9.4H, v24.4S, #12
+        sqrshrn     v10.4H, v23.4S, #12
+        sqrshrn     v11.4H, v22.4S, #12
+        sqrshrn     v12.4H, v21.4S, #12
+        sqrshrn     v13.4H, v20.4S, #12
+        sqrshrn     v14.4H, v19.4S, #12
+        sqrshrn     v15.4H, v18.4S, #12
+        transpose_16b_4x4 v0, v1, v2, v3, v20, v21, v22, v23
+        transpose_16b_4x4 v4, v5, v6, v7, v20, v21, v22, v23
+        transpose_16b_4x4 v8, v9, v10, v11, v20, v21, v22, v23
+        transpose_16b_4x4 v12, v13, v14, v15, v20, v21, v22, v23
+        trn1        v20.2D, v0.2D, v4.2D                    //vswp      d9, d12
+        trn1        v21.2D, v8.2D, v12.2D                   //vswp      d3, d6
+        trn1        v22.2D, v1.2D, v5.2D                    //vswp      d3, d9
+        trn1        v23.2D, v9.2D, v13.2D                   //vswp      d6, d12
+        trn1        v24.2D, v2.2D, v6.2D                    //vswp      d1, d4
+        trn1        v25.2D, v10.2D, v14.2D                  //vswp      d11, d14
+        trn1        v26.2D, v3.2D, v7.2D                    //vswp      d2, d8
+        trn1        v27.2D, v11.2D, v15.2D                  //vswp      d7, d13
+        st1         {v20.2D - v23.2D}, [x0], #64
+        st1         {v24.2D - v27.2D}, [x0], #64
+        subs        x4, x4, #1
+        b.ne        1b
+        ret
+endfunc
+
+.macro  write32_buffer
+        st1         {v0.1D - v3.1D}, [x8], #32
+        st1         {v4.1D - v7.1D}, [x8], #32
+        st1         {v8.1D - v11.1D}, [x8], #32
+        st1         {v12.1D - v15.1D}, [x8], #32
+        st1         {v16.1D - v19.1D}, [x8], #32
+        st1         {v20.1D - v23.1D}, [x8], #32
+        st1         {v24.1D - v27.1D}, [x8], #32
+        st1         {v28.1D - v31.1D}, [x8], #32
+.endm
+
+.macro  tr32_out tmp, dst_first, dst_last, shift
+        ld1         {v12.2D}, [x6], #16
+        ld1         {v13.2D}, [x7], #16
+        add         \tmp\().4S, v12.4S, v13.4S
+        sub         v13.4S, v13.4S, v12.4S
+        sqrshrn     \dst_first\().4H, \tmp\().4S, \shift
+        sqrshrn     \dst_last\().4H, v13.4S, \shift
+.endm
+
+.macro  tr32_transform_func shift, action, limit
+        mov         x4, XZR
+        mov         x10, x8
+0:      add         x3, x3, #32
+        ld1         {v0.2D - v1.2D}, [x3]
+        sub         x3, x3, #32
+        add         x0, x0, x5
+        lsl         x5, x5, #1
+        ld1         {v16.1D}, [x0], x5
+        ld1         {v17.1D}, [x0], x5
+        ld1         {v18.1D}, [x0], x5
+        ld1         {v19.1D}, [x0], x5
+        ld1         {v20.1D}, [x0], x5
+        ld1         {v21.1D}, [x0], x5
+        ld1         {v22.1D}, [x0], x5
+        ld1         {v23.1D}, [x0], x5
+        ld1         {v24.1D}, [x0], x5
+        ld1         {v25.1D}, [x0], x5
+        ld1         {v26.1D}, [x0], x5
+        ld1         {v27.1D}, [x0], x5
+        ld1         {v28.1D}, [x0], x5
+        ld1         {v29.1D}, [x0], x5
+        ld1         {v30.1D}, [x0], x5
+        ld1         {v31.1D}, [x0], x5
+        sub         x0, x0, x5, lsl #4
+        tr32_begin
+        add         x0, x0, x5, lsr #1
+        sub         x6, x6, x5, lsl #1
+        ld1         {v0.2D - v1.2D}, [x3]
+        lsl         x5, x5, #1
+        ld1         {v24.1D}, [x0], x5 
+        ld1         {v25.1D}, [x0], x5
+        ld1         {v26.1D}, [x0], x5
+        ld1         {v27.1D}, [x0], x5
+        ld1         {v28.1D}, [x0], x5
+        ld1         {v29.1D}, [x0], x5
+        ld1         {v30.1D}, [x0], x5
+        ld1         {v31.1D}, [x0], x5
+        sub         x0, x0, x5, lsl #3
+        sub         x0, x0, x5, lsr #1
+        tr16_begin  v24, v25, v26, v27, v28, v29, v30, v31
+        st1         {v2.2D - v5.2D}, [x9], #64
+        st1         {v6.2D - v9.2D}, [x9], #64
+        sub         x9, x9, #128
+        ld1         {v24.1D}, [x0], x5
+        ld1         {v25.1D}, [x0], x5
+        ld1         {v26.1D}, [x0], x5
+        ld1         {v27.1D}, [x0], x5
+        ld1         {v28.1D}, [x0], x5
+        ld1         {v29.1D}, [x0], x5
+        ld1         {v30.1D}, [x0], x5
+        ld1         {v31.1D}, [x0], x5
+        sub         x0, x0, x5, lsl #3
+        tr8_begin   v25, v27, v29, v31
+        tr4         v24, v26, v28, v30
+        tr8_end_0
+        ld1         {v0.2D - v3.2D}, [x9], #64
+        add         v4.4S, v8.4S, v0.4S
+        sub         v8.4S, v8.4S, v0.4S
+        add         v5.4S, v9.4S, v1.4S
+        sub         v9.4S, v9.4S, v1.4S
+        add         v6.4S, v10.4S, v2.4S
+        sub         v10.4S, v10.4S, v2.4S
+        add         v7.4S, v11.4S, v3.4S
+        sub         v11.4S, v11.4S, v3.4S
+        st1         {v4.2D - v7.2D}, [x7], #64
+        ld1         {v0.2D - v3.2D}, [x9], #64
+        sub         x9, x9, #128
+        add         v4.4S, v12.4S, v0.4S
+        sub         v12.4S, v12.4S, v0.4S
+        add         v5.4S, v13.4S, v1.4S
+        sub         v13.4S, v13.4S, v1.4S
+        add         v6.4S, v14.4S, v2.4S
+        sub         v14.4S, v14.4S, v2.4S
+        add         v7.4S, v15.4S, v3.4S
+        sub         v15.4S, v15.4S, v3.4S
+        st1         {v4.2D - v7.2D}, [x7], #64
+        st1         {v15.2D}, [x7], #16
+        st1         {v14.2D}, [x7], #16
+        st1         {v13.2D}, [x7], #16
+        st1         {v12.2D}, [x7], #16
+        st1         {v11.2D}, [x7], #16
+        st1         {v10.2D}, [x7], #16
+        st1         {v9.2D}, [x7], #16
+        st1         {v8.2D}, [x7], #16
+        sub         x7, x7, x5
+        add         x0, x0, #8
+        lsr         x5, x5, #2
+        tr32_out    v14, v0, v28, \shift
+        tr32_out    v14, v8, v29, \shift
+        tr32_out    v14, v16, v30, \shift
+        tr32_out    v14, v24, v31, \shift
+        st1         {v28.1D - v31.1D}, [x9], #32
+        tr32_out    v14, v1, v28, \shift
+        tr32_out    v14, v9, v29, \shift
+        tr32_out    v14, v17, v30, \shift
+        tr32_out    v14, v25, v31, \shift
+        st1         {v28.1D - v31.1D}, [x9], #32
+        tr32_out    v14, v2, v28, \shift
+        tr32_out    v14, v10, v29, \shift
+        tr32_out    v14, v18, v30, \shift
+        tr32_out    v14, v26, v31, \shift
+        st1         {v28.1D - v31.1D}, [x9], #32
+        tr32_out    v14, v3, v28, \shift
+        tr32_out    v14, v11, v29, \shift
+        tr32_out    v14, v19, v30, \shift
+        tr32_out    v14, v27, v31, \shift
+        st1         {v28.1D - v31.1D}, [x9], #32
+        sub         x9, x9, #128
+        transpose_16b_4x4 v0, v8, v16, v24, v28, v29, v30, v31
+        transpose_16b_4x4 v1, v9, v17, v25, v28, v29, v30, v31
+        transpose_16b_4x4 v2, v10, v18, v26, v28, v29, v30, v31
+        transpose_16b_4x4 v3, v11, v19, v27, v28, v29, v30, v31
+        ld1         {v31.1D}, [x9], #8
+        ld1         {v23.1D}, [x9], #8
+        ld1         {v15.1D}, [x9], #8
+        ld1         {v7.1D},  [x9], #8
+        ld1         {v30.1D}, [x9], #8
+        ld1         {v22.1D}, [x9], #8
+        ld1         {v14.1D}, [x9], #8
+        ld1         {v6.1D},  [x9], #8
+        ld1         {v29.1D}, [x9], #8
+        ld1         {v21.1D}, [x9], #8
+        ld1         {v13.1D}, [x9], #8
+        ld1         {v5.1D},  [x9], #8
+        ld1         {v28.1D}, [x9], #8
+        ld1         {v20.1D}, [x9], #8
+        ld1         {v12.1D}, [x9], #8
+        ld1         {v4.1D},  [x9], #8
+        sub         x9, x9, #128
+        st1         {v24.1D - v27.1D}, [x9]
+        transpose_16b_4x4 v4, v12, v20, v28, v24, v25, v26, v27
+        transpose_16b_4x4 v5, v13, v21, v29, v24, v25, v26, v27
+        transpose_16b_4x4 v6, v14, v22, v30, v24, v25, v26, v27
+        transpose_16b_4x4 v7, v15, v23, v31, v24, v25, v26, v27
+        ld1         {v24.1D - v27.1D}, [x9]
+        \action
+        sub         x6, x6, #256
+        sub         x7, x7, #256
+        add         x4, x4, #4
+        cmp         x4, \limit
+        b.lt        0b
+        sub         x0, x0, x4, lsl #1
+1:      cmp         x4, #32
+        b.ge        2f
+        \action
+        add         x4, x4, #4
+        b           1b
+2:      mov         x8, x10
+.endm
+
+const trans_coeff, align=8
+.quad 0x0000000000240053  // 36, 83
+.quad 0x003200120059004b  // 89, 75, 50, 18
+.quad 0x00500046005a0057  // 90, 87, 80, 70
+.quad 0x001900090039002b  // 57, 43, 25, 9
+.quad 0x00550058005a005a  // 88, 85, 90, 90
+.quad 0x00430049004e0052  // 73, 67, 82, 78
+.quad 0x0026002e0036003d  // 46, 38, 61, 54
+.quad 0x0004000d0016001f  // 13, 04, 31, 22
+endconst
+
+function ff_hevc_transform_32x32_neon_8, export=1
+        mov         x6, #32
+        add         x7, x1, #4
+        cmp         x1, #28
+        csel        x1, x6, x7, gt
+        mov         x2, x1
+        movrel      x3, trans_coeff
+        sub         x6, sp, #256
+        mov         x7, #63
+        bic         x6, x6, x7
+        sub         x7, x6, #256
+        sub         x8, x7, #2048
+        sub         x9, x8, #256
+        mov         x5, #64
+        tr32_transform_func #7, write32_buffer, x2
+        mov         x2, x0
+        mov         x0, x8
+        mov         x8, x2
+        tr32_transform_func #12, write32_buffer, #32
+        ret
+endfunc
+
+function ff_hevc_idct_4x4_dc_neon_8, export=1
+        ldrsh       w2, [x0]
+        dup         v8.8H, w2
+        srshr       v8.8H, v8.8H, #1
+        srshr       v8.8H, v8.8H, #6
+        mov         v9.16B, v8.16B
+        st1         {v8.2D, v9.2D}, [x0]
+        ret
+endfunc
+
+function ff_hevc_idct_8x8_dc_neon_8, export=1
+        ldrsh       w2, [x0]
+        dup         v8.8H, w2
+        srshr       v8.8H, v8.8H, #1
+        srshr       v8.8H, v8.8H, #6
+        mov         v9.16B,  v8.16B
+        mov         v10.16B, v8.16B
+        mov         v11.16B, v8.16B
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0]
+        ret
+endfunc
+
+function ff_hevc_idct_16x16_dc_neon_8, export=1
+        ldrsh       w2, [x0]
+        dup         v8.8H, w2
+        srshr       v8.8H, v8.8H, #1
+        srshr       v8.8H, v8.8H, #6
+        mov         v9.16B,  v8.16B
+        mov         v10.16B, v8.16B
+        mov         v11.16B, v8.16B
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0]
+        ret
+endfunc
+
+function ff_hevc_idct_32x32_dc_neon_8, export=1
+        ldrsh       w2, [x0]
+        dup         v8.8H, w2
+        srshr       v8.8H, v8.8H, #1
+        srshr       v8.8H, v8.8H, #6
+        mov         v9.16B,  v8.16B
+        mov         v10.16B, v8.16B
+        mov         v11.16B, v8.16B
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v8.2D - v11.2D}, [x0]
+        ret
+endfunc
+
+function ff_hevc_transform_add_4x4_neon_8, export=1
+        ld1         {v0.2D, v1.2D}, [x1]
+        ld1         {v2.S}[0], [x0], x2
+        ld1         {v2.S}[1], [x0], x2
+        ld1         {v3.S}[0], [x0], x2
+        ld1         {v3.S}[1], [x0], x2
+        sub         x0, x0, x2, lsl #2
+        uxtl        v8.8H, v2.8B
+        uxtl        v9.8H, v3.8B
+        sqadd       v0.8H, v0.8H, v8.8H
+        sqadd       v1.8H, v1.8H, v9.8H
+        sqxtun      v4.8B, v0.8H
+        sqxtun      v5.8B, v1.8H
+        st1         {v4.S}[0], [x0], x2
+        st1         {v4.S}[1], [x0], x2
+        st1         {v5.S}[0], [x0], x2
+        st1         {v5.S}[1], [x0], x2
+        ret
+endfunc
+
+function ff_hevc_transform_add_8x8_neon_8, export=1
+        mov         x3, #8
+1:      subs        x3, x3, #1
+        ld1         {v0.2D}, [x1], #16
+        ld1         {v8.1D}, [x0]
+        uxtl        v8.8H, v8.8B
+        sqadd       v0.8H, v0.8H, v8.8H
+        sqxtun      v4.8B, v0.8H
+        st1         {v4.1D}, [x0], x2
+        b.ne        1b
+        ret
+endfunc
+
+function ff_hevc_transform_add_16x16_neon_8, export=1
+        mov         x3, #16
+1:      subs        x3, x3, #1
+        ld1         {v0.2D - v1.2D}, [x1], #32
+        ld1         {v8.2D}, [x0]
+        uxtl        v9.8H, v8.8B
+        uxtl2       v10.8H, v8.16B
+        sqadd       v0.8H, v0.8H, v9.8H
+        sqadd       v1.8H, v1.8H, v10.8H
+        sqxtun      v4.8B, v0.8H
+        sqxtun2     v4.16B, v1.8H
+        st1         {v4.2D}, [x0], x2
+        b.ne        1b
+        ret
+endfunc
+
+function ff_hevc_transform_add_32x32_neon_8, export=1
+        mov         x3, #32
+1:      subs        x3, x3, #1
+        ld1         {v0.2D - v3.2D}, [x1], #64
+        ld1         {v8.2D, v9.2D}, [x0]
+        uxtl        v10.8H, v8.8B
+        uxtl2       v11.8H, v8.16B
+        uxtl        v12.8H, v9.8B
+        uxtl2       v13.8H, v9.16B
+        sqadd       v0.8H, v0.8H, v10.8H
+        sqadd       v1.8H, v1.8H, v11.8H
+        sqadd       v2.8H, v2.8H, v12.8H
+        sqadd       v3.8H, v3.8H, v13.8H
+        sqxtun      v4.8B, v0.8H
+        sqxtun2     v4.16B, v1.8H
+        sqxtun      v5.8B, v2.8H
+        sqxtun2     v5.16B, v3.8H
+        st1         {v4.2D, v5.2D}, [x0], x2
+        b.ne        1b
+        ret
+endfunc
+
+function ff_hevc_transform_4x4_neon_8, export=1
+        ld1         {v28.2D - v29.2D}, [x0]
+        ldr         w2, =0x00240053
+        mov         v0.S[0], w2
+        tr4_shift   v28, v29, #7 //3210->3120
+        zip1        v26.8H, v28.8H, v29.8H
+        zip2        v27.8H, v28.8H, v29.8H
+        zip1        v28.4S, v26.4S, v27.4S
+        zip2        v29.4S, v26.4S, v27.4S
+        tr4_shift   v28, v29, #12 //3210->3120
+        zip1        v26.8H, v28.8H, v29.8H
+        zip2        v27.8H, v28.8H, v29.8H
+        zip1        v28.4S, v26.4S, v27.4S
+        zip2        v29.4S, v26.4S, v27.4S
+        st1         {v28.2D - v29.2D}, [x0]
+        ret
+endfunc
+
+function ff_hevc_transform_luma_4x4_neon_8, export=1
+        ld1         {v28.2D - v29.2D}, [x0]
+        ldr         x1, =0x0037001d004a
+        mov         v1.D[0], x1
+        uxtl        v0.4S, v1.4H
+        tr4_luma_shift  v28, v29, #7
+        zip1        v26.8H, v28.8H, v29.8H
+        zip2        v27.8H, v28.8H, v29.8H
+        zip1        v28.4S, v26.4S, v27.4S
+        zip2        v29.4S, v26.4S, v27.4S
+        tr4_luma_shift	v28, v29, #12
+        zip1        v26.8H, v28.8H, v29.8H
+        zip2        v27.8H, v28.8H, v29.8H
+        zip1        v28.4S, v26.4S, v27.4S
+        zip2        v29.4S, v26.4S, v27.4S
+        st1         {v28.2D - v29.2D}, [x0]
+        ret
+endfunc
-- 
2.3.2 (Apple Git-55)



From 7aa6c4482d67f715e7d799294aeb06a0e6cd786a Mon Sep 17 00:00:00 2001
From: zjh8890 <zjh8890 at users.noreply.github.com>
Date: Mon, 18 Jan 2016 17:16:28 +0800
Subject: [PATCH 02/12] Create hevcdsp_init_aarch64.c


Add the aarch64 init file that checks the CPU flags and dispatches to the NEON HEVC DSP functions.


Signed-off-by: zjh8890 <243186085 at qq.com>
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 libavcodec/aarch64/hevcdsp_init_aarch64.c


diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
new file mode 100644
index 0000000..e8c2802
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -0,0 +1,33 @@
+/*
+ * ARM NEON optimised HEVC decode for aarch64
+ * Copyright (c) 2015 Junhai ZHANG <243186085 at qq.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/hevcdsp.h"
+#include "hevcdsp_aarch64.h"
+
+av_cold void ff_hevcdsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags))
+        ff_hevcdsp_init_neon(c, bit_depth);
+}
-- 
2.3.2 (Apple Git-55)



From 6eccea0d8600c64972af9f5b02e84a7499ca4133 Mon Sep 17 00:00:00 2001
From: zjh8890 <zjh8890 at users.noreply.github.com>
Date: Mon, 18 Jan 2016 17:19:25 +0800
Subject: [PATCH 03/12] Create hevcdsp_init_neon.c


Add the NEON init file that registers the aarch64 HEVC IDCT, DC IDCT and transform-add function pointers.


Signed-off-by: zjh8890 <243186085 at qq.com>
---
 libavcodec/aarch64/hevcdsp_init_neon.c | 64 ++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 libavcodec/aarch64/hevcdsp_init_neon.c


diff --git a/libavcodec/aarch64/hevcdsp_init_neon.c b/libavcodec/aarch64/hevcdsp_init_neon.c
new file mode 100644
index 0000000..0a3b2e5
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_init_neon.c
@@ -0,0 +1,64 @@
+/*
+ *  ARM NEON optimised HEVC decode for aarch64
+ *  Copyright (c) 2015 Junhai ZHANG <243186085 at qq.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/hevcdsp.h"
+#include "hevcdsp_aarch64.h"
+
+void ff_hevc_transform_4x4_neon_8(int16_t *coeffs, int col_limit);
+void ff_hevc_transform_8x8_neon_8(int16_t *coeffs, int col_limit);
+void ff_hevc_transform_16x16_neon_8(int16_t *coeffs, int col_limit);
+void ff_hevc_transform_32x32_neon_8(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_4x4_dc_neon_8(int16_t *coeffs);
+void ff_hevc_idct_8x8_dc_neon_8(int16_t *coeffs);
+void ff_hevc_idct_16x16_dc_neon_8(int16_t *coeffs);
+void ff_hevc_idct_32x32_dc_neon_8(int16_t *coeffs);
+void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs);
+void ff_hevc_transform_add_4x4_neon_8(uint8_t *_dst, int16_t *coeffs,
+                                      ptrdiff_t stride);
+void ff_hevc_transform_add_8x8_neon_8(uint8_t *_dst, int16_t *coeffs,
+                                      ptrdiff_t stride);
+void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
+                                      ptrdiff_t stride);
+void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
+                                      ptrdiff_t stride);
+
+
+av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
+{
+    if (bit_depth == 8) {
+        int x;
+        c->idct[0]                          = ff_hevc_transform_4x4_neon_8;
+        c->idct[1]                          = ff_hevc_transform_8x8_neon_8;
+        c->idct[2]                          = ff_hevc_transform_16x16_neon_8;
+        c->idct[3]                          = ff_hevc_transform_32x32_neon_8;
+        c->idct_dc[0]                       = ff_hevc_idct_4x4_dc_neon_8;
+        c->idct_dc[1]                       = ff_hevc_idct_8x8_dc_neon_8;
+        c->idct_dc[2]                       = ff_hevc_idct_16x16_dc_neon_8;
+        c->idct_dc[3]                       = ff_hevc_idct_32x32_dc_neon_8;
+        c->transform_add[0]                 = ff_hevc_transform_add_4x4_neon_8;
+        c->transform_add[1]                 = ff_hevc_transform_add_8x8_neon_8;
+        c->transform_add[2]                 = ff_hevc_transform_add_16x16_neon_8;
+        c->transform_add[3]                 = ff_hevc_transform_add_32x32_neon_8;
+        c->idct_4x4_luma                    = ff_hevc_transform_luma_4x4_neon_8;
+    }
+}
-- 
2.3.2 (Apple Git-55)



From 5ac978c2df7bdf891c0149ed83b7f9a6b5317a0c Mon Sep 17 00:00:00 2001
From: zjh8890 <zjh8890 at users.noreply.github.com>
Date: Mon, 18 Jan 2016 17:22:17 +0800
Subject: [PATCH 04/12] Update Makefile to add aarch64 NEON HEVC decoder optimizations


Hook the new aarch64 NEON HEVC objects into the libavcodec/aarch64 build.


Signed-off-by: zjh8890 <243186085 at qq.com>
---
 libavcodec/aarch64/Makefile | 4 ++++
 1 file changed, 4 insertions(+)


diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index d001b34..3ef75b8 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -3,6 +3,7 @@ OBJS-$(CONFIG_H264CHROMA)               += aarch64/h264chroma_init_aarch64.o
 OBJS-$(CONFIG_H264DSP)                  += aarch64/h264dsp_init_aarch64.o
 OBJS-$(CONFIG_H264PRED)                 += aarch64/h264pred_init.o
 OBJS-$(CONFIG_H264QPEL)                 += aarch64/h264qpel_init_aarch64.o
+OBJS-$(CONFIG_HEVC_DECODER)             += aarch64/hevcdsp_init_aarch64.o
 OBJS-$(CONFIG_HPELDSP)                  += aarch64/hpeldsp_init_aarch64.o
 OBJS-$(CONFIG_IMDCT15)                  += aarch64/imdct15_init.o
 OBJS-$(CONFIG_MPEGAUDIODSP)             += aarch64/mpegaudiodsp_init.o
@@ -28,3 +29,6 @@ NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
 NEON-OBJS-$(CONFIG_MDCT)                += aarch64/mdct_neon.o
 
 NEON-OBJS-$(CONFIG_VORBIS_DECODER)      += aarch64/vorbisdsp_neon.o
+
+NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_init_neon.o		\
+                                           aarch64/hevcdsp_idct_neon.o
-- 
2.3.2 (Apple Git-55)



From 4bb10fc74c38dfcbca4354132189b1032d14a8cb Mon Sep 17 00:00:00 2001
From: zjh8890 <zjh8890 at users.noreply.github.com>
Date: Mon, 18 Jan 2016 17:25:44 +0800
Subject: [PATCH 05/12] Add aarch64 NEON optimizations for the HEVC decoder (hevcdsp.c)


Call ff_hevcdsp_init_aarch64() from the generic HEVC DSP init.


Signed-off-by: zjh8890 <243186085 at qq.com>
---
 libavcodec/hevcdsp.c | 2 ++
 1 file changed, 2 insertions(+)


diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
index 9d773d9..bfe37ef 100644
--- a/libavcodec/hevcdsp.c
+++ b/libavcodec/hevcdsp.c
@@ -259,6 +259,8 @@ int i = 0;
 
     if (ARCH_X86)
         ff_hevc_dsp_init_x86(hevcdsp, bit_depth);
+    if (ARCH_AARCH64)
+        ff_hevcdsp_init_aarch64(hevcdsp, bit_depth);
     if (ARCH_ARM)
         ff_hevcdsp_init_arm(hevcdsp, bit_depth);
     if (ARCH_MIPS)
-- 
2.3.2 (Apple Git-55)



From 5f139d949dd8d1a939e9b6f4836ecd81ad8298b5 Mon Sep 17 00:00:00 2001
From: zjh8890 <zjh8890 at users.noreply.github.com>
Date: Mon, 18 Jan 2016 17:27:26 +0800
Subject: [PATCH 06/12] Declare ff_hevcdsp_init_aarch64() in hevcdsp.h


Add the ff_hevcdsp_init_aarch64() prototype to hevcdsp.h for the AArch64 NEON optimization of the HEVC decoder.


Signed-off-by: zjh8890 <243186085 at qq.com>
---
 libavcodec/hevcdsp.h | 1 +
 1 file changed, 1 insertion(+)


diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
index 9f1f6dd..757a441 100644
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@ -128,6 +128,7 @@ extern const int8_t ff_hevc_epel_filters[7][4];
 extern const int8_t ff_hevc_qpel_filters[3][16];
 
 void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth);
+void ff_hevcdsp_init_aarch64(HEVCDSPContext *c, const int bit_depth);
 void ff_hevcdsp_init_arm(HEVCDSPContext *c, const int bit_depth);
 void ff_hevc_dsp_init_mips(HEVCDSPContext *c, const int bit_depth);
 #endif /* AVCODEC_HEVCDSP_H */
-- 
2.3.2 (Apple Git-55)



From 97d72592f61d267ceb57db0cf43f78cecb14efc1 Mon Sep 17 00:00:00 2001
From: zjh8890 <zjh8890 at users.noreply.github.com>
Date: Tue, 19 Jan 2016 23:25:35 +0800
Subject: [PATCH 07/12] Create hevcdsp_aarch64.h


Signed-off-by: zjh8890 <243186085 at qq.com>
---
 libavcodec/aarch64/hevcdsp_aarch64.h | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 libavcodec/aarch64/hevcdsp_aarch64.h


diff --git a/libavcodec/aarch64/hevcdsp_aarch64.h b/libavcodec/aarch64/hevcdsp_aarch64.h
new file mode 100644
index 0000000..d44fdc1
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_aarch64.h
@@ -0,0 +1,29 @@
+/*
+ *  ARM NEON optimised HEVC functions for ARMv8 (AArch64)
+ *  Copyright (c) 2015 Junhai ZHANG <243186085 at qq.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AARCH64_HEVCDSP_AARCH64_H
+#define AVCODEC_AARCH64_HEVCDSP_AARCH64_H
+
+#include "libavcodec/hevcdsp.h"
+
+void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth);
+
+#endif /* AVCODEC_AARCH64_HEVCDSP_AARCH64_H */
-- 
2.3.2 (Apple Git-55)



From fd5e4e9c00f3e66a99483aba4d4db371ef9d1923 Mon Sep 17 00:00:00 2001
From: zjh8890 <zjh8890 at users.noreply.github.com>
Date: Wed, 27 Jan 2016 22:41:03 +0800
Subject: [PATCH 08/12] Update Makefile for HEVC qpel NEON optimization


Add the qpel NEON object to the AArch64 NEON-OBJS list so the new hevcdsp_qpel_neon.S is built.


Signed-off-by: zjh8890 <243186085 at qq.com>
---
 libavcodec/aarch64/Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)


diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 3ef75b8..2a6c95c 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -31,4 +31,5 @@ NEON-OBJS-$(CONFIG_MDCT)                += aarch64/mdct_neon.o
 NEON-OBJS-$(CONFIG_VORBIS_DECODER)      += aarch64/vorbisdsp_neon.o
 
 NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_init_neon.o		\
-                                           aarch64/hevcdsp_idct_neon.o
+                                           aarch64/hevcdsp_idct_neon.o		\
+                                           aarch64/hevcdsp_qpel_neon.o
-- 
2.3.2 (Apple Git-55)



From f5ccce6494d6e1dee63560d4e927efa15f5facfe Mon Sep 17 00:00:00 2001
From: zjh8890 <zjh8890 at users.noreply.github.com>
Date: Wed, 27 Jan 2016 22:44:46 +0800
Subject: [PATCH 09/12] Create hevcdsp_qpel_neon.S


Signed-off-by: zjh8890 <243186085 at qq.com>
---
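Notes for reviewers (not part of the commit message): the qpel_filter_1/2/3
macros below implement the three HEVC 8-tap luma interpolation filters; the
_32b variants apply the same taps to the 16-bit intermediate samples for the
vertical pass. As a plain-C reference of what the NEON code computes (the
table mirrors ff_hevc_qpel_filters; the helper function is illustrative only
and is not part of the patch):

#include <stdint.h>

/* Taps of the three luma quarter-pel filters, matching the multiplies and
 * shifts used by the qpel_filter_N macros. */
static const int8_t qpel_taps[3][8] = {
    { -1, 4, -10, 58, 17,  -5, 1,  0 },   /* fraction 1 */
    { -1, 4, -11, 40, 40, -11, 4, -1 },   /* fraction 2 */
    {  0, 1,  -5, 17, 58, -10, 4, -1 },   /* fraction 3 */
};

/* One sample of the first (16-bit intermediate) pass; src must have three
 * valid samples to its left.  The plain put_qpel_* functions store this
 * value as-is, the uw_* variants narrow it with a rounding >>6 (or >>7
 * after adding src2 in the bi-prediction case). */
static int qpel_ref(const uint8_t *src, int frac)
{
    int i, sum = 0;
    for (i = 0; i < 8; i++)
        sum += qpel_taps[frac - 1][i] * src[i - 3];
    return sum;
}
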
 libavcodec/aarch64/hevcdsp_qpel_neon.S | 1418 ++++++++++++++++++++++++++++++++
 1 file changed, 1418 insertions(+)
 create mode 100644 libavcodec/aarch64/hevcdsp_qpel_neon.S


diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
new file mode 100644
index 0000000..356aa55
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -0,0 +1,1418 @@
+/*
+ * ARM NEON optimised HEVC decode for aarch64
+ * Copyright (c) 2015 Junhai ZHANG <243186085 at qq.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+#define MAX_PB_SIZE #64
+#define MAX_PB_DOUBLESIZE #128
+
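+// The put_pixels_wN functions copy an unfiltered block into the 16-bit
+// intermediate buffer: samples are shifted left by 6 and the destination
+// stride is MAX_PB_SIZE int16_t elements (x0 = dst, x1 = src,
+// x2 = srcstride, x3 = height).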
+.macro init_put_pixels
+        prfm PLDL1STRM,   [x1]
+        prfm PLDL1STRM,   [x1, x2]
+        mov  x12, MAX_PB_DOUBLESIZE
+.endm
+
+function ff_hevc_put_pixels_w2_neon_8, export=1
+        init_put_pixels
+0:      subs    x3, x3, #2
+        ld1         {v0.H}[0], [x1], x2
+        ld1         {v0.H}[1], [x1], x2
+        ushll       v0.8H, v0.8B, #6
+        st1         {v0.S}[0], [x0], x12
+        st1         {v0.S}[1], [x0], x12
+        b.ne    0b
+        ret
+endfunc
+
+function ff_hevc_put_pixels_w4_neon_8, export=1
+        init_put_pixels
+0:      subs    x3, x3, #2
+        ld1         {v0.S}[0], [x1], x2
+        ld1         {v0.S}[1], [x1], x2
+        ushll       v0.8H, v0.8B, #6
+        st1         {v0.D}[0], [x0], x12
+        st1         {v0.D}[1], [x0], x12
+        b.ne    0b
+        ret
+endfunc
+
+function ff_hevc_put_pixels_w6_neon_8, export=1
+        init_put_pixels
+        sub     x10, x2, #4
+        sub     x11, x12, #8
+0:      subs    x3, x3, #2
+        ld1         {v0.S}[0], [x1], #4
+        ld1         {v1.H}[0], [x1], x10
+        ld1         {v0.S}[1], [x1], #4
+        ld1         {v1.H}[1], [x1], x10
+        ushll       v0.8H, v0.8B, #6
+        ushll       v1.8H, v1.8B, #6
+        st1         {v0.1D}, [x0], #8
+        st1         {v1.S}[0], [x0], x11
+        st1         {v0.D}[1], [x0], #8
+        st1         {v1.S}[1], [x0], x11
+        b.ne    0b
+        ret
+endfunc
+
+function ff_hevc_put_pixels_w8_neon_8, export=1
+        init_put_pixels
+0:      subs    x3, x3, #2
+        ld1         {v0.8B}, [x1], x2
+        ld1         {v1.8B}, [x1], x2
+        ushll       v8.8H, v0.8B, #6
+        ushll       v9.8H, v1.8B, #6
+        st1         {v8.8H}, [x0], x12
+        st1         {v9.8H}, [x0], x12
+        b.ne    0b
+        ret
+endfunc
+
+function ff_hevc_put_pixels_w12_neon_8, export=1
+        init_put_pixels
+        sub     x10, x2, #8
+        sub     x11, x12, #16
+0:      subs    x3, x3, #2
+        ld1         {v0.1D}, [x1], #8
+        ld1         {v2.S}[0], [x1], x10
+        ld1         {v1.1D}, [x1], #8
+        ld1         {v2.S}[1], [x1], x10
+        ushll       v0.8H, v0.8B, #6
+        ushll       v1.8H, v1.8B, #6
+        ushll       v2.8H, v2.8B, #6
+        st1         {v0.2D}, [x0], #16
+        st1         {v2.1D}, [x0], x11
+        st1         {v1.2D}, [x0], #16
+        st1         {v2.D}[1], [x0], x11
+        b.ne    0b
+        ret
+endfunc
+
+function ff_hevc_put_pixels_w16_neon_8, export=1
+        init_put_pixels
+0:      subs    x3, x3, #2
+        ld1         {v0.2D}, [x1], x2
+        ld1         {v1.2D}, [x1], x2
+        ushll       v8.8H, v0.8B, #6
+        ushll2      v9.8H, v0.16B, #6
+        ushll       v10.8H, v1.8B, #6
+        ushll2      v11.8H, v1.16B, #6
+        st1         {v8.2D - v9.2D}, [x0], x12
+        st1         {v10.2D - v11.2D}, [x0], x12
+        b.ne    0b
+        ret
+endfunc
+
+function ff_hevc_put_pixels_w24_neon_8, export=1
+        init_put_pixels
+0:      subs    x3, x3, #1
+        ld1         {v0.1D - v2.1D}, [x1], x2
+        ushll       v8.8H, v0.8B, #6
+        ushll       v9.8H, v1.8B, #6
+        ushll       v10.8H, v2.8B, #6
+        st1         {v8.2D - v10.2D}, [x0], x12
+        b.ne    0b
+        ret
+endfunc
+
+function ff_hevc_put_pixels_w32_neon_8, export=1
+        init_put_pixels
+0:      subs    x3, x3, #1
+        ld1         {v0.2D - v1.2D}, [x1], x2
+        ushll       v8.8H, v0.8B, #6
+        ushll2      v9.8H, v0.16B, #6
+        ushll       v10.8H, v1.8B, #6
+        ushll2      v11.8H, v1.16B, #6
+        st1         {v8.2D - v11.2D}, [x0], x12
+        b.ne    0b
+        ret
+endfunc
+
+function ff_hevc_put_pixels_w48_neon_8, export=1
+        init_put_pixels
+        sub     x11, x12, #64
+0:      subs    x3, x3, #1
+        ld1         {v0.2D - v2.2D}, [x1], x2
+        ushll       v8.8H, v0.8B, #6
+        ushll2      v9.8H, v0.16B, #6
+        ushll       v10.8H, v1.8B, #6
+        ushll2      v11.8H, v1.16B, #6
+        ushll       v12.8H, v2.8B, #6
+        ushll2      v13.8H, v2.16B, #6
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v12.2D - v13.2D}, [x0], x11
+        b.ne    0b
+        ret
+endfunc
+
+function ff_hevc_put_pixels_w64_neon_8, export=1
+        init_put_pixels
+        sub     x11, x12, #64
+0:      subs    x3, x3, #1
+        ld1         {v0.2D - v3.2D}, [x1], x2
+        ushll       v8.8H, v0.8B, #6
+        ushll2      v9.8H, v0.16B, #6
+        ushll       v10.8H, v1.8B, #6
+        ushll2      v11.8H, v1.16B, #6
+        ushll       v12.8H, v2.8B, #6
+        ushll2      v13.8H, v2.16B, #6
+        ushll       v14.8H, v3.8B, #6
+        ushll2      v15.8H, v3.16B, #6
+        st1         {v8.2D - v11.2D}, [x0], #64
+        st1         {v12.2D - v15.2D}, [x0], x11
+        b.ne    0b
+        ret
+endfunc
+
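+// Slide the eight-tap window: drop the oldest row (regshuffle_d8, byte
+// samples) or intermediate row (regshuffle_v8, 16-bit samples) so each
+// loop iteration only has to load/filter one new one.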
+.macro  regshuffle_d8
+        mov         v16.8B, v17.8B
+        mov         v17.8B, v18.8B
+        mov         v18.8B, v19.8B
+        mov         v19.8B, v20.8B
+        mov         v20.8B, v21.8B
+        mov         v21.8B, v22.8B
+        mov         v22.8B, v23.8B
+.endm
+
+.macro  regshuffle_v8
+        mov         v0.16B, v1.16B
+        mov         v1.16B, v2.16B
+        mov         v2.16B, v3.16B
+        mov         v3.16B, v4.16B
+        mov         v4.16B, v5.16B
+        mov         v5.16B, v6.16B
+        mov         v6.16B, v7.16B
+.endm
+
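+// Load one row of source bytes and build the eight byte-shifted views
+// (v16..v23) that feed the horizontal 8-tap filter; vextin8_4 does the
+// same for two 4-pixel rows at once, interleaving them with trn1.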
+.macro  vextin8
+        prfm PLDL1STRM,   [x2]
+        ld1         {v22.1D - v23.1D}, [x2], x3
+        ext         v16.8B, v22.8B, v23.8B, #1
+        ext         v17.8B, v22.8B, v23.8B, #2
+        ext         v18.8B, v22.8B, v23.8B, #3
+        ext         v19.8B, v22.8B, v23.8B, #4
+        ext         v20.8B, v22.8B, v23.8B, #5
+        ext         v21.8B, v22.8B, v23.8B, #6
+        ext         v22.8B, v22.8B, v23.8B, #7
+.endm
+
+.macro  vextin8_4
+        prfm PLDL1STRM,   [x2]
+        ld1         {v22.1D - v23.1D}, [x2], x3
+        ld1         {v24.1D - v25.1D}, [x2], x3
+        ext         v16.8B, v22.8B, v23.8B, #1
+        ext         v17.8B, v22.8B, v23.8B, #2
+        ext         v18.8B, v22.8B, v23.8B, #3
+        ext         v19.8B, v22.8B, v23.8B, #4
+        ext         v20.8B, v22.8B, v23.8B, #5
+        ext         v21.8B, v22.8B, v23.8B, #6
+        ext         v22.8B, v22.8B, v23.8B, #7
+        ext         v26.8B, v24.8B, v25.8B, #1
+        ext         v27.8B, v24.8B, v25.8B, #2
+        ext         v28.8B, v24.8B, v25.8B, #3
+        ext         v29.8B, v24.8B, v25.8B, #4
+        ext         v30.8B, v24.8B, v25.8B, #5
+        ext         v31.8B, v24.8B, v25.8B, #6
+        ext         v24.8B, v24.8B, v25.8B, #7
+        trn1        v16.4S, v16.4S, v26.4S
+        trn1        v17.4S, v17.4S, v27.4S
+        trn1        v18.4S, v18.4S, v28.4S
+        trn1        v19.4S, v19.4S, v29.4S
+        trn1        v20.4S, v20.4S, v30.4S
+        trn1        v21.4S, v21.4S, v31.4S
+        trn1        v22.4S, v22.4S, v24.4S
+        trn1        v23.4S, v23.4S, v25.4S
+.endm
+
+.macro  loadin8
+        prfm PLDL1STRM,   [x2]
+        ld1         {v16.1D}, [x2], x3
+        prfm PLDL1STRM,   [x2]
+        ld1         {v17.1D}, [x2], x3
+        prfm PLDL1STRM,   [x2]
+        ld1         {v18.1D}, [x2], x3
+        prfm PLDL1STRM,   [x2]
+        ld1         {v19.1D}, [x2], x3
+        prfm PLDL1STRM,   [x2]
+        ld1         {v20.1D}, [x2], x3
+        prfm PLDL1STRM,   [x2]
+        ld1         {v21.1D}, [x2], x3
+        prfm PLDL1STRM,   [x2]
+        ld1         {v22.1D}, [x2], x3
+        prfm PLDL1STRM,   [x2]
+        ld1         {v23.1D}, [x2], x3
+.endm
+
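+// qpel_filter_N_32b: apply the 8-tap luma filter to the eight 16-bit
+// intermediate rows held in v0..v7 (vertical pass of the hNvM cases),
+// accumulating in 32 bits and narrowing the result into v8 with >>6.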
+.macro qpel_filter_1_32b
+        movi        v16.8H, #58
+        movi        v17.8H, #10
+        smull       v9.4S, v3.4H, v16.4H
+        smull2      v10.4S, v3.8H, v16.8H
+        movi        v16.8H, #17
+        smull       v11.4S, v2.4H, v17.4H
+        smull2      v12.4S, v2.8H, v17.8H
+        movi        v17.8H, #5
+        smull       v13.4S, v4.4H, v16.4H
+        smull2      v14.4S, v4.8H, v16.8H
+        smull       v15.4S, v5.4H, v17.4H
+        smull2      v8.4S, v5.8H, v17.8H
+        sub         v9.4S, v9.4S, v11.4S
+        sub         v10.4S, v10.4S, v12.4S
+        sshll       v11.4S, v1.4H, #2
+        sshll2      v12.4S, v1.8H, #2
+        add         v9.4S, v9.4S, v13.4S
+        add         v10.4S, v10.4S, v14.4S
+        ssubl       v13.4S, v6.4H, v0.4H
+        ssubl2      v14.4S, v6.8H, v0.8H
+        add         v9.4S, v9.4S, v11.4S
+        add         v10.4S, v10.4S, v12.4S
+        sub         v13.4S, v13.4S, v15.4S
+        sub         v14.4S, v14.4S, v8.4S
+        add         v9.4S, v9.4S, v13.4S
+        add         v10.4S, v10.4S, v14.4S
+        sqshrn      v8.4H, v9.4S, #6
+        sqshrn2     v8.8H, v10.4S, #6
+.endm
+
+.macro qpel_filter_2_32b
+        movi        v8.4S, #11
+        saddl       v9.4S, v3.4H, v4.4H
+        saddl2      v10.4S, v3.8H, v4.8H
+        saddl       v11.4S, v2.4H, v5.4H
+        saddl2      v12.4S, v2.8H, v5.8H
+        mul         v11.4S, v11.4S, v8.4S
+        mul         v12.4S, v12.4S, v8.4S
+        movi        v8.4S, #40
+        saddl       v15.4S, v1.4H, v6.4H
+        mul         v9.4S, v9.4S, v8.4S
+        mul         v10.4S, v10.4S, v8.4S
+        saddl2      v8.4S, v1.8H, v6.8H
+        saddl       v13.4S, v0.4H, v7.4H
+        saddl2      v14.4S, v0.8H, v7.8H
+        shl         v15.4S, v15.4S, #2
+        shl         v8.4S, v8.4S, #2
+        add         v11.4S, v11.4S, v13.4S
+        add         v12.4S, v12.4S, v14.4S
+        add         v9.4S, v9.4S, v15.4S
+        add         v10.4S, v10.4S, v8.4S
+        sub         v9.4S, v9.4S, v11.4S
+        sub         v10.4S, v10.4S, v12.4S
+        sqshrn      v8.4H, v9.4S, #6
+        sqshrn2     v8.8H, v10.4S, #6
+.endm
+
+.macro qpel_filter_3_32b
+        movi        v16.8H, #58
+        movi        v17.8H, #10
+        smull       v9.4S, v4.4H, v16.4H
+        smull2      v10.4S, v4.8H, v16.8H
+        movi        v16.8H, #17
+        smull       v11.4S, v5.4H, v17.4H
+        smull2      v12.4S, v5.8H, v17.8H
+        movi        v17.8H, #5
+        smull       v13.4S, v3.4H, v16.4H
+        smull2      v14.4S, v3.8H, v16.8H
+        smull       v15.4S, v2.4H, v17.4H
+        smull2      v8.4S, v2.8H, v17.8H
+        sub         v9.4S, v9.4S, v11.4S
+        sub         v10.4S, v10.4S, v12.4S
+        sshll       v11.4S, v6.4H, #2
+        sshll2      v12.4S, v6.8H, #2
+        add         v9.4S, v9.4S, v13.4S
+        add         v10.4S, v10.4S, v14.4S
+        ssubl       v13.4S, v1.4H, v7.4H
+        ssubl2      v14.4S, v1.8H, v7.8H
+        add         v9.4S, v9.4S, v11.4S
+        add         v10.4S, v10.4S, v12.4S
+        sub         v13.4S, v13.4S, v15.4S
+        sub         v14.4S, v14.4S, v8.4S
+        add         v9.4S, v9.4S, v13.4S
+        add         v10.4S, v10.4S, v14.4S
+        sqshrn      v8.4H, v9.4S, #6
+        sqshrn2     v8.8H, v10.4S, #6
+.endm
+
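+// qpel_filter_N: the same 8-tap luma filters applied directly to the byte
+// samples in v16..v23; the 16-bit result is left unshifted in \out.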
+.macro  qpel_filter_1 out=v7
+        movi        v24.8B, #58
+        movi        v25.8B, #10
+        ushll       v13.8H, v20.8B, #4
+        ushll       v14.8H, v21.8B, #2
+        umull       \out\().8H, v19.8B, v24.8B
+        uaddw       v13.8H, v13.8H, v20.8B
+        umull       v15.8H, v18.8B, v25.8B
+        uaddw       v14.8H, v14.8H, v21.8B
+        usubl       v12.8H, v22.8B, v16.8B
+        add         \out\().8H, \out\().8H, v13.8H
+        ushll       v13.8H, v17.8B, #2
+        add         v15.8H, v15.8H, v14.8H
+        add         v13.8H, v13.8H, v12.8H
+        sub         \out\().8H, \out\().8H, v15.8H
+        add         \out\().8H, \out\().8H, v13.8H
+.endm
+
+.macro  qpel_filter_2 out=v7
+        movi        v12.8H, #10
+        movi        v14.8H, #11
+        uaddl       v13.8H, v19.8B, v20.8B
+        uaddl       v15.8H, v18.8B, v21.8B
+        mul         v13.8H, v13.8H, v12.8H
+        mul         v15.8H, v15.8H, v14.8H
+        uaddl       \out\().8H, v17.8B, v22.8B
+        uaddl       v12.8H, v16.8B, v23.8B
+        add         \out\().8H, \out\().8H, v13.8H
+        add         v12.8H, v12.8H, v15.8H
+        shl         \out\().8H, \out\().8H, #2
+        sub         \out\().8H, \out\().8H, v12.8H
+.endm
+
+.macro  qpel_filter_3 out=v7
+        movi        v24.8B, #58
+        movi        v25.8B, #10
+        ushll       v13.8H, v19.8B, #4
+        ushll       v14.8H, v18.8B, #2
+        umull       \out\().8H, v20.8B, v24.8B
+        uaddw       v13.8H, v13.8H, v19.8B
+        umull       v15.8H, v21.8B, v25.8B
+        uaddw       v14.8H, v14.8H, v18.8B
+        usubl       v12.8H, v17.8B, v23.8B
+        add         \out\().8H, \out\().8H, v13.8H
+        ushll       v13.8H, v22.8B, #2  
+        add         v15.8H, v15.8H, v14.8H 
+        add         v13.8H, v13.8H, v12.8H 
+        sub         \out\().8H, \out\().8H, v15.8H
+        add         \out\().8H, \out\().8H, v13.8H
+.endm
+
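+// Vertical-only qpel into the 16-bit intermediate buffer.
+// x0 = dst (int16_t, stride x1 in elements), x2 = src, x3 = srcstride,
+// x4 = height, x5 = width; the block is processed in 8-wide columns.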
+.macro  hevc_put_qpel_vX_neon_8 filter
+        sub     x2, x2, x3, lsl #1
+        sub     x2, x2, x3
+        mov     x12, x4
+        mov     x6, x0
+        mov     x7, x2
+        lsl     x1, x1, #1
+0:      loadin8
+        cmp     x5, #4
+        b.eq    4f
+8:      subs    x4, x4, #1
+        \filter
+        st1         {v7.2D}, [x0], x1
+        regshuffle_d8
+        ld1         {v23.1D}, [x2], x3
+        b.ne    8b
+        subs    x5, x5, #8
+        b.eq    99f
+        mov     x4, x12
+        add     x6, x6, #16
+        mov     x0, x6
+        add     x7, x7, #8
+        mov     x2, x7
+        b       0b
+4:      subs    x4, x4, #1
+        \filter
+        st1         {v7.1D}, [x0], x1
+        regshuffle_d8
+        ld1         {v23.S}[0], [x2], x3
+        b.ne    4b
+99:     ret
+.endm
+
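+// "uw" variants write 8-bit pixels directly: x0 = dst, x1 = dststride,
+// x4 = width, x5 = height, x6 = src2 (16-bit second source for
+// bi-prediction, or NULL), x7 = src2stride.  Unidirectional output rounds
+// with >>6; the .Lbi path adds src2 and rounds with >>7.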
+.macro  hevc_put_qpel_uw_vX_neon_8 filter
+        sub     x2, x2, x3, lsl #1
+        sub     x2, x2, x3
+        mov     x12, x5
+        mov     x13, x0
+        mov     x14, x2
+        cmp     x6, #0
+        b.ne    .Lbi\@
+0:      loadin8
+        cmp     x4, #4
+        b.eq    4f
+8:      subs    x5, x5, #1
+        \filter
+        sqrshrun    v0.8B, v7.8H, #6
+        st1         {v0.1D}, [x0], x1
+        regshuffle_d8
+        ld1         {v23.1D}, [x2], x3
+        b.ne    8b
+        subs    x4, x4, #8
+        b.eq    99f
+        add     x13, x13, #8
+        add     x14, x14, #8
+        mov     x5, x12
+        mov     x0, x13
+        mov     x2, x14
+        b       0b
+4:      subs    x5, x5, #1
+        \filter
+        sqrshrun    v0.8B, v7.8H, #6
+        st1         {v0.S}[0], [x0], x1
+        regshuffle_d8
+        ld1         {v23.S}[0], [x2], x3
+        b.ne    4b
+        ret
+.Lbi\@:
+        lsl     x7, x7, #1
+        mov     x15, x6
+0:      loadin8
+        cmp     x4, #4
+        b.eq    4f
+8:      subs    x5, x5, #1
+        \filter
+        ld1         {v0.2D}, [x6], x7
+        sqadd       v0.8H, v0.8H, v7.8H
+        sqrshrun    v0.8B, v0.8H, #7
+        st1         {v0.1D}, [x0], x1
+        regshuffle_d8
+        ld1         {v23.1D}, [x2], x3
+        b.ne    8b
+        subs    x4, x4, #8
+        b.eq    99f
+        add     x13, x13, #8
+        add     x15, x15, #16
+        add     x14, x14, #8
+        mov     x5, x12
+        mov     x0, x13
+        mov     x6, x15
+        mov     x2, x14
+        b       0b
+4:      subs    x5, x5, #1
+        \filter
+        ld1         {v0.1D}, [x6], x7
+        sqadd       v0.4H, v0.4H, v7.4H
+        sqrshrun    v0.8B, v0.8H, #7
+        st1         {v0.S}[0], [x0], x1
+        regshuffle_d8
+        ld1         {v23.S}[0], [x2], x3
+        b.ne    4b
+99:     ret
+.endm
+
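+// Weighted-prediction variants: weights and offsets are taken from the
+// stack and the weights are pre-shifted by (7 - denom).  The uni path
+// computes clip(((filt * wx) >> (denom + 6)) + ox); the .Lbi path also
+// weights src2 and folds (ox0 + ox1 + 1) into the rounding term.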
+.macro  hevc_put_qpel_uw_weight_vX_neon_8 filter
+        ldp     w8, w9, [sp]
+        ldp     w10, w11, [sp, #8]
+        mov     w12, #7
+        sub     w12, w12, w7
+        lsl     w8, w8, w12
+        lsl     w9, w9, w12
+        sub     x2, x2, x3, lsl #1
+        sub     x2, x2, x3
+        dup         v0.8H, w10
+        dup         v1.8H, w8
+        mov     x12, x0
+        mov     x13, x2
+        mov     x14, x5
+        cmp     x6, #0
+        b.ne    .Lbi\@
+
+0:      loadin8
+        cmp     x4, #4
+        b.eq    4f
+8:      subs    x5, x5, #1
+        \filter
+        smull       v14.4S, v1.4H, v7.4H
+        smull2      v15.4S, v1.8H, v7.8H
+        rshrn       v8.4H, v14.4S, #13
+        rshrn2      v8.8H, v15.4S, #13
+        sqadd       v8.8H, v8.8H, v0.8H
+        sqxtun      v8.8B, v8.8H
+        st1         {v8.1D}, [x0], x1
+        prfm PLDL1STRM,   [x2]
+        regshuffle_d8
+        ld1         {v23.1D}, [x2], x3
+        b.ne    8b
+        subs    x4, x4, #8
+        b.eq    99f
+        add     x12, x12, #8
+        add     x13, x13, #8
+        mov     x0, x12
+        mov     x2, x13
+        mov     x5, x14
+        b       0b
+4:      subs    x5, x5, #1
+        \filter
+        smull       v14.4S, v1.4H, v7.4H
+        rshrn       v8.4H, v14.4S, #13
+        sqadd       v8.8H, v8.8H, v0.8H
+        sqxtun      v8.8B, v8.8H
+        st1         {v8.S}[0], [x0], x1
+        prfm PLDL1STRM,   [x2]
+        regshuffle_d8
+        ld1         {v23.S}[0], [x2], x3
+        b.ne    4b
+        ret
+.Lbi\@:
+        add     w10, w10, w11
+        add     w10, w10, #1
+        lsl     w10, w10, #13
+        dup     v0.4S, w10
+        dup     v2.8H, w9
+        mov     x7, MAX_PB_DOUBLESIZE
+        mov     x11, x6
+0:      loadin8
+        cmp     x4, #4
+        b.eq    4f
+8:      subs    x5, x5, #1
+        \filter
+        ld1     {v4.2D}, [x6], x7
+        smull       v14.4S, v2.4H, v7.4H
+        smull2      v15.4S, v2.8H, v7.8H
+        smull       v12.4S, v1.4H, v4.4H
+        smull2      v13.4S, v1.8H, v4.8H
+        add         v14.4S, v14.4S, v12.4S
+        add         v15.4S, v15.4S, v13.4S
+        add         v14.4S, v14.4S, v0.4S
+        add         v15.4S, v15.4S, v0.4S
+        shrn        v8.4H, v14.4S, #14
+        shrn2       v8.8H, v15.4S, #14
+        sqxtun      v8.8B, v8.8H
+        st1         {v8.1D}, [x0], x1
+        prfm PLDL1STRM,   [x2]
+        regshuffle_d8
+        ld1         {v23.1D}, [x2], x3
+        b.ne    8b
+        subs    x4, x4, #8
+        b.eq    99f
+        add     x11, x11, #16
+        add     x12, x12, #8
+        add     x13, x13, #8
+        mov     x6, x11
+        mov     x0, x12
+        mov     x2, x13
+        mov     x5, x14
+        b       0b
+4:      subs    x5, x5, #1
+        \filter
+        ld1         {v4.1D}, [x6], x7
+        smull       v14.4S, v2.4H, v7.4H
+        smull       v12.4S, v1.4H, v4.4H
+        add         v14.4S, v14.4S, v12.4S
+        add         v14.4S, v14.4S, v0.4S
+        shrn        v8.4H, v14.4S, #14
+        sqxtun      v8.8B, v8.8H
+        st1         {v8.S}[0], [x0], x1
+        prfm PLDL1STRM,   [x2]
+        regshuffle_d8
+        ld1         {v23.S}[0], [x2], x3
+        b.ne    4b
+99:     ret
+.endm
+
+function ff_hevc_put_qpel_v1_neon_8, export=1
+        hevc_put_qpel_vX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_v2_neon_8, export=1
+        hevc_put_qpel_vX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_v3_neon_8, export=1
+        hevc_put_qpel_vX_neon_8 qpel_filter_3
+endfunc
+
+function ff_hevc_put_qpel_uw_v1_neon_8, export=1
+        hevc_put_qpel_uw_vX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_uw_v2_neon_8, export=1
+        hevc_put_qpel_uw_vX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_uw_v3_neon_8, export=1
+        hevc_put_qpel_uw_vX_neon_8 qpel_filter_3
+endfunc
+
+function ff_hevc_put_qpel_uw_weight_v1_neon_8, export=1
+        hevc_put_qpel_uw_weight_vX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_uw_weight_v2_neon_8, export=1
+        hevc_put_qpel_uw_weight_vX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_uw_weight_v3_neon_8, export=1
+        hevc_put_qpel_uw_weight_vX_neon_8 qpel_filter_3
+endfunc
+
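+// Horizontal-only qpel into the 16-bit intermediate buffer; same register
+// layout as the vertical macro, with vextin8/vextin8_4 providing the
+// shifted source windows (two rows per iteration in the 4-wide case).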
+.macro hevc_put_qpel_hX_neon_8 filter
+        sub     x2, x2, #4
+        lsl     x1, x1, #1
+        mov     x12, x4
+        mov     x6, x0
+        mov     x7, x2
+        cmp     x5, #4
+        b.eq    4f
+8:      subs    x4, x4, #1
+        vextin8
+        \filter
+        st1         {v7.2D}, [x0], x1
+        b.ne    8b
+        subs    x5, x5, #8
+        b.eq    99f
+        mov     x4, x12
+        add     x6, x6, #16
+        mov     x0, x6
+        add     x7, x7, #8
+        mov     x2, x7
+        cmp     x5, #4
+        b.ne    8b
+4:      subs    x4, x4, #2
+        vextin8_4
+        \filter
+        st1         {v7.D}[0], [x0], x1
+        st1         {v7.D}[1], [x0], x1
+        b.ne    4b
+99:     ret
+.endm
+
+.macro hevc_put_qpel_uw_hX_neon_8 filter
+        sub     x2, x2, #4
+        mov     x12, x5
+        mov     x13, x0
+        mov     x14, x2
+        cmp     x6, #0
+        b.ne    .Lbi\@
+        cmp     x4, #4
+        b.eq    4f
+8:      subs    x5, x5, #1
+        vextin8
+        \filter
+        sqrshrun    v0.8B, v7.8H, #6
+        st1         {v0.1D}, [x0], x1
+        b.ne    8b
+        subs    x4, x4, #8
+        b.eq    99f
+        add     x13, x13, #8
+        add     x14, x14, #8
+        mov     x5, x12
+        mov     x0, x13
+        mov     x2, x14
+        cmp     x4, #4
+        b.ne    8b
+4:      subs    x5, x5, #2
+        vextin8_4
+        \filter
+        sqrshrun    v0.8B, v7.8H, #6
+        st1         {v0.S}[0], [x0], x1
+        st1         {v0.S}[1], [x0], x1
+        b.ne    4b
+        ret
+.Lbi\@:
+        lsl     x7, x7, #1
+        cmp     x4, #4
+        b.eq    4f
+        mov     x15, x6
+8:      subs    x5, x5, #1
+        vextin8
+        \filter
+        ld1         {v0.2D}, [x6], x7
+        sqadd      v0.8H, v0.8H, v7.8H
+        sqrshrun    v0.8B, v0.8H, #7
+        st1         {v0.1D}, [x0]
+        add     x0, x0, x1
+        b.ne    8b
+        subs    x4, x4, #8
+        b.eq    99f
+        add     x15, x15, #16
+        add     x13, x13, #8
+        add     x14, x14, #8
+        mov     x5, x12
+        mov     x6, x15
+        mov     x2, x14
+        mov     x0, x13
+        cmp     x4, #4
+        b.ne    8b
+4:      subs    x5, x5, #2
+        vextin8_4
+        \filter
+        ld1         {v0.1D}, [x6], x7
+        ld1         {v0.D}[1], [x6], x7
+        sqadd      v0.8H, v0.8H, v7.8H
+        sqrshrun    v0.8B, v0.8H, #7
+        st1         {v0.S}[0], [x0], x1
+        st1         {v0.S}[1], [x0], x1
+        b.ne    4b
+99:     ret
+.endm
+
+.macro  hevc_put_qpel_uw_weight_hX_neon_8 filter
+        ldp     w8, w9, [sp]
+        ldp     w10, w11, [sp, #8]
+        mov     w12, #7
+        sub     w12, w12, w7
+        sub     x2, x2, #4
+        lsl     w8, w8, w12
+        lsl     w9, w9, w12
+        dup         v0.8H, w10
+        dup         v1.8H, w8
+        mov     x12, x0
+        mov     x13, x2
+        mov     x14, x5
+        cmp     x6, #0
+        b.ne    .Lbi\@
+        cmp     x4, #4
+        b.eq    4f
+8:      subs    x5, x5, #1
+        vextin8
+        \filter                          
+        smull       v14.4S, v1.4H, v7.4H
+        smull2      v15.4S, v1.8H, v7.8H
+        rshrn       v8.4H, v14.4S, #13
+        rshrn2      v8.8H, v15.4S, #13
+        sqadd       v8.8H, v8.8H, v0.8H
+        sqxtun      v8.8B, v8.8H
+        st1         {v8.1D}, [x0], x1
+        b.ne    8b
+        subs    x4, x4, #8
+        b.eq    99f
+        add     x12, x12, #8
+        add     x13, x13, #8
+        mov     x0, x12
+        mov     x2, x13
+        mov     x5, x14
+        cmp     x4, #4
+        b.ne    8b
+4:      subs    x5, x5, #2
+        vextin8_4
+        \filter
+        smull       v14.4S, v1.4H, v7.4H
+        smull2      v15.4S, v1.8H, v7.8H
+        rshrn       v8.4H, v14.4S, #13
+        rshrn2      v8.8H, v15.4S, #13
+        sqadd       v8.8H, v8.8H, v0.8H
+        sqxtun      v8.8B, v8.8H
+        st1         {v8.S}[0], [x0], x1
+        st1         {v8.S}[1], [x0], x1
+        b.ne    4b
+        ret
+.Lbi\@:
+        add     w10, w10, w11
+        add     w10, w10, #1
+        lsl     w10, w10, #13
+        dup         v0.4S, w10
+        dup         v1.8H, w9
+        dup         v2.8H, w8
+        mov     x7, MAX_PB_DOUBLESIZE
+        cmp     x4, #4
+        b.eq    4f
+        mov     x11, x6
+8:      subs    x5, x5, #1
+        vextin8
+        \filter
+        ld1         {v4.2D}, [x6], x7
+        smull       v14.4S, v1.4H, v7.4H
+        smull2      v15.4S, v1.8H, v7.8H
+        smull       v12.4S, v2.4H, v4.4H
+        smull2      v13.4S, v2.8H, v4.8H
+        add         v14.4S, v14.4S, v12.4S
+        add         v15.4S, v15.4S, v13.4S
+        add         v14.4S, v14.4S, v0.4S
+        add         v15.4S, v15.4S, v0.4S
+        shrn        v8.4H, v14.4S, #14
+        shrn2       v8.8H, v15.4S, #14
+        sqxtun      v8.8B, v8.8H
+        st1         {v8.1D}, [x0], x1
+        b.ne    8b
+        subs    x4, x4, #8
+        b.eq    99f
+        add     x11, x11, #16
+        add     x12, x12, #8
+        add     x13, x13, #8
+        mov     x6, x11
+        mov     x0, x12
+        mov     x2, x13
+        mov     x5, x14
+        cmp     x4, #4
+        b.ne    8b
+4:      subs    x5, x5, #2
+        vextin8_4
+        \filter
+        ld1         {v4.D}[0], [x6], x7
+        ld1         {v4.D}[1], [x6], x7
+        smull       v14.4S, v1.4H, v7.4H
+        smull2      v15.4S, v1.8H, v7.8H
+        smull       v12.4S, v2.4H, v4.4H
+        smull2      v13.4S, v2.8H, v4.8H
+        add         v14.4S, v14.4S, v12.4S
+        add         v15.4S, v15.4S, v13.4S
+        add         v14.4S, v14.4S, v0.4S
+        add         v15.4S, v15.4S, v0.4S
+        shrn        v8.4H, v14.4S, #14
+        shrn2       v8.8H, v15.4S, #14
+        sqxtun      v8.8B, v8.8H
+        st1         {v8.S}[0], [x0], x1
+        st1         {v8.S}[1], [x0], x1
+        b.ne        4b
+99:     ret
+.endm
+
+function ff_hevc_put_qpel_h1_neon_8, export=1
+        hevc_put_qpel_hX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_h2_neon_8, export=1
+        hevc_put_qpel_hX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_h3_neon_8, export=1
+        hevc_put_qpel_hX_neon_8 qpel_filter_3
+endfunc
+
+function ff_hevc_put_qpel_uw_h1_neon_8, export=1
+        hevc_put_qpel_uw_hX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_uw_h2_neon_8, export=1
+        hevc_put_qpel_uw_hX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_uw_h3_neon_8, export=1
+        hevc_put_qpel_uw_hX_neon_8 qpel_filter_3
+endfunc
+
+function ff_hevc_put_qpel_uw_weight_h1_neon_8, export=1
+        hevc_put_qpel_uw_weight_hX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_uw_weight_h2_neon_8, export=1        
+        hevc_put_qpel_uw_weight_hX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_uw_weight_h3_neon_8, export=1        
+        hevc_put_qpel_uw_weight_hX_neon_8 qpel_filter_3
+endfunc
+
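+// Combined horizontal + vertical qpel: \filterh produces eight filtered
+// 16-bit rows in v0..v7, then \filterv (a _32b filter) is applied per
+// output row, shifting the row window with regshuffle_v8.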
+.macro hevc_put_qpel_hXvY_neon_8 filterh filterv
+        sub     x2, x2, #4 
+        sub     x2, x2, x3, lsl #1 
+        sub     x2, x2, x3
+        lsl     x1, x1, #1
+        mov     x12, x4
+        mov     x6, x0
+        mov     x7, x2
+0:      vextin8                                       
+        \filterh    v0
+        vextin8
+        \filterh    v1
+        vextin8
+        \filterh    v2
+        vextin8
+        \filterh    v3
+        vextin8
+        \filterh    v4
+        vextin8
+        \filterh    v5
+        vextin8
+        \filterh    v6
+        vextin8
+        \filterh    v7
+        cmp     x5, #4
+        b.eq    4f
+8:      subs    x4, x4, #1
+        \filterv
+        st1         {v8.2D}, [x0], x1
+        regshuffle_v8
+        vextin8
+        \filterh    v7
+        b.ne    8b
+        subs    x5, x5, #8
+        b.eq    99f
+        mov     x4, x12
+        add     x6, x6, #16
+        mov     x0, x6
+        add     x7, x7, #8
+        mov     x2, x7
+        b       0b
+4:      subs    x4, x4, #1
+        \filterv
+        st1         {v8.1D}, [x0], x1
+        regshuffle_v8
+        vextin8
+        \filterh    v7
+        b.ne    4b
+99:     ret
+.endm
+
+.macro hevc_put_qpel_uw_hXvY_neon_8 filterh filterv
+        sub     x2, x2, #4
+        sub     x2, x2, x3, lsl #1
+        sub     x2, x2, x3
+        mov     x12, x5
+        mov     x13, x0
+        mov     x14, x2
+        cmp     x6, #0
+        b.ne    .Lbi\@
+0:      vextin8
+        \filterh    v0
+        vextin8
+        \filterh    v1
+        vextin8
+        \filterh    v2
+        vextin8
+        \filterh    v3
+        vextin8
+        \filterh    v4
+        vextin8
+        \filterh    v5
+        vextin8
+        \filterh    v6
+        vextin8
+        \filterh    v7
+        cmp     x4, #4
+        b.eq    4f
+8:      subs    x5, x5, #1
+        \filterv
+        sqrshrun    v0.8B, v8.8H, #6
+        st1         {v0.1D}, [x0], x1
+        regshuffle_v8
+        vextin8
+        \filterh    v7
+        b.ne    8b
+        subs    x4, x4, #8
+        b.eq     99f
+        add     x13, x13, #8
+        add     x14, x14, #8
+        mov     x5, x12
+        mov     x0, x13
+        mov     x2, x14
+        b       0b
+4:      subs    x5, x5, #1
+        \filterv
+        sqrshrun    v0.8B, v8.8H, #6
+        st1         {v0.S}[0], [x0], x1
+        regshuffle_v8
+        vextin8
+        \filterh    v7
+        b.ne    4b
+        ret
+.Lbi\@:
+        lsl     x7, x7, #1
+        mov     x15, x6
+0:      vextin8
+        \filterh    v0
+        vextin8
+        \filterh    v1
+        vextin8
+        \filterh    v2
+        vextin8
+        \filterh    v3
+        vextin8
+        \filterh    v4
+        vextin8
+        \filterh    v5
+        vextin8
+        \filterh    v6
+        vextin8
+        \filterh    v7
+        cmp     x4, #4
+        b.eq    4f
+8:      subs    x5, x5, #1
+        \filterv
+        ld1         {v0.2D}, [x6], x7
+        sqadd       v0.8H, v0.8H, v8.8H
+        sqrshrun    v0.8B, v0.8H, #7
+        st1         {v0.1D}, [x0], x1
+        regshuffle_v8
+        vextin8
+        \filterh    v7
+        b.ne    8b
+        subs    x4, x4, #8
+        b.eq    99f
+        add     x13, x13, #8
+        add     x14, x14, #8
+        add     x15, x15, #16
+        mov     x5, x12
+        mov     x0, x13
+        mov     x2, x14
+        mov     x6, x15
+        b       0b
+4:      subs    x5, x5, #1
+        \filterv
+        ld1         {v0.1D}, [x6], x7
+        sqadd       v0.4H, v0.4H, v8.4H
+        sqrshrun    v0.8B, v0.8H, #7
+        st1         {v0.S}[0], [x0], x1
+        regshuffle_v8
+        vextin8
+        \filterh    v7
+        b.ne    4b
+99:     ret
+.endm
+
+.macro hevc_put_qpel_uw_weight_hXvY_neon_8 filterh filterv
+        ldp     w8, w9, [sp]
+        ldp     w10, w11, [sp, #8]
+        mov     w12, #7
+        sub     w12, w12, w7
+        lsl     w8, w8, w12
+        lsl     w9, w9, w12
+        dup         v28.8H, w10
+        dup         v29.8H, w8
+        sub     x2, x2, #4
+        sub     x2, x2, x3, lsl #1
+        sub     x2, x2, x3
+        mov     x12, x0
+        mov     x13, x2
+        mov     x14, x5
+        cmp     x6, #0
+        b.ne    .Lbi\@
+0:      vextin8
+        \filterh    v0
+        vextin8
+        \filterh    v1
+        vextin8
+        \filterh    v2
+        vextin8
+        \filterh    v3
+        vextin8
+        \filterh    v4
+        vextin8
+        \filterh    v5
+        vextin8
+        \filterh    v6
+        vextin8
+        \filterh    v7
+        cmp     x4, #4
+        b.eq    4f
+8:      subs    x5, x5, #1
+        \filterv
+        smull       v14.4S, v29.4H, v8.4H
+        smull2      v15.4S, v29.8H, v8.8H
+        rshrn       v8.4H, v14.4S, #13
+        rshrn2      v8.8H, v15.4S, #13
+        sqadd       v8.8H, v8.8H, v28.8H
+        sqxtun      v8.8B, v8.8H
+        st1         {v8.1D}, [x0], x1
+        regshuffle_v8
+        vextin8
+        \filterh    v7
+        b.ne    8b
+        subs    x4, x4, #8
+        b.eq    99f
+        add     x12, x12, #8
+        add     x13, x13, #8
+        mov     x0, x12
+        mov     x2, x13
+        mov     x5, x14
+        b       0b
+4:      subs    x5, x5, #1
+        \filterv
+        smull       v14.4S, v29.4H, v8.4H
+        rshrn       v8.4H, v14.4S, #13
+        sqadd       v8.8H, v8.8H, v28.8H
+        sqxtun      v8.8B, v8.8H
+        st1         {v8.S}[0], [x0], x1
+        regshuffle_v8
+        vextin8
+        \filterh    v7
+        b.ne    4b
+        ret
+.Lbi\@:
+        add     w10, w10, w11
+        add     w10, w10, #1
+        lsl     w10, w10, #13
+        dup         v28.4S, w10
+        dup         v30.8H, w9
+        mov     x7, MAX_PB_DOUBLESIZE
+        mov     x11, x6
+0:      vextin8
+        \filterh    v0
+        vextin8
+        \filterh    v1
+        vextin8
+        \filterh    v2
+        vextin8
+        \filterh    v3
+        vextin8
+        \filterh    v4
+        vextin8
+        \filterh    v5
+        vextin8
+        \filterh    v6
+        vextin8
+        \filterh    v7
+        cmp     x4, #4
+        b.eq    4f
+8:      subs    x5, x5, #1
+        \filterv
+        ld1         {v0.2D}, [x6], x7
+        smull       v14.4S, v30.4H, v8.4H
+        smull2      v15.4S, v30.8H, v8.8H
+        smull       v12.4S, v29.4H, v0.4H
+        smull2      v13.4S, v29.8H, v0.8H
+        add         v14.4S, v14.4S, v12.4S
+        add         v15.4S, v15.4S, v13.4S
+        add         v14.4S, v14.4S, v28.4S
+        add         v15.4S, v15.4S, v28.4S
+        rshrn       v8.4H, v14.4S, #14
+        rshrn2      v8.8H, v15.4S, #14
+        sqxtun      v8.8B, v8.8H
+        st1         {v8.1D}, [x0], x1
+        regshuffle_v8
+        vextin8
+        \filterh    v7
+        b.ne    8b
+        subs    x4, x4, #8
+        b.eq    99f
+        add     x11, x11, #16
+        add     x12, x12, #8
+        add     x13, x13, #8
+        mov     x6, x11
+        mov     x0, x12
+        mov     x2, x13
+        mov     x5, x14
+        b       0b
+4:      subs    x5, x5, #1
+        \filterv
+        ld1         {v0.1D}, [x6], x7
+        smull       v14.4S, v30.4H, v8.4H
+        smull       v12.4S, v29.4H, v0.4H
+        add         v14.4S, v14.4S, v12.4S
+        add         v14.4S, v14.4S, v28.4S
+        rshrn       v8.4H, v14.4S, #14
+        sqxtun      v8.8B, v8.8H
+        st1         {v8.S}[0], [x0], x1
+        regshuffle_v8
+        vextin8
+        \filterh    v7
+        b.ne    4b
+99:     ret
+.endm
+
+function ff_hevc_put_qpel_h1v1_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_h2v1_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_h3v1_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_h1v2_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_h2v2_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_h3v2_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_h1v3_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_h2v3_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_h3v3_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h1v1_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h2v1_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h3v1_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h1v2_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h2v2_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h3v2_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h1v3_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h2v3_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h3v3_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_weight_h1v1_neon_8, export=1
+        hevc_put_qpel_uw_weight_hXvY_neon_8 qpel_filter_1 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_weight_h2v1_neon_8, export=1
+        hevc_put_qpel_uw_weight_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_weight_h3v1_neon_8, export=1
+        hevc_put_qpel_uw_weight_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_weight_h1v2_neon_8, export=1
+        hevc_put_qpel_uw_weight_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_weight_h2v2_neon_8, export=1
+        hevc_put_qpel_uw_weight_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_weight_h3v2_neon_8, export=1
+        hevc_put_qpel_uw_weight_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_weight_h1v3_neon_8, export=1
+        hevc_put_qpel_uw_weight_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_weight_h2v3_neon_8, export=1
+        hevc_put_qpel_uw_weight_hXvY_neon_8 qpel_filter_2 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_weight_h3v3_neon_8, export=1
+        hevc_put_qpel_uw_weight_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_uni_w_neon_8, export=1
+        ldr     w10, [sp, #16]                              
+        mov     w11, #7
+        sub     w5, w11, w5                               
+        lsl     w6, w6, w5                                 
+        dup         v12.16B, w6
+        dup         v14.8H, w7
+        mov     x12, x4
+        mov     x13, x0
+        mov     x14, x2
+        cmp     w10, #4
+        b.eq    4f
+8:      subs    x4, x4, #1 
+        ld1         {v0.1D}, [x2], x3
+        umull       v8.8H, v0.8B, v12.8B
+        urshr       v8.8H, v8.8H, #7
+        usqadd      v8.8H, v14.8H
+        uqxtn       v0.8B, v8.8H
+        st1         {v0.1D}, [x0], x1
+        b.ne    8b
+        subs    w10, w10, #8
+        b.eq    99f
+        add     x13, x13, #8
+        add     x14, x14, #8
+        mov     x4, x12
+        mov     x0, x13
+        mov     x2, x14
+        cmp     w10, #4
+        b.ne    8b
+4:      subs    x4, x4, #2
+        ld1         {v0.S}[0], [x2], x3
+        ld1         {v0.S}[1], [x2], x3
+        umull       v8.8H, v0.8B, v12.8B
+        urshr       v8.8H, v8.8H, #7
+        usqadd      v8.8H, v14.8H
+        uqxtn       v0.8B, v8.8H
+        st1         {v0.S}[0], [x0], x1
+        st1         {v0.S}[1], [x0], x1
+        b.ne    4b
+99:     ret
+endfunc
+
+function ff_hevc_put_qpel_bi_w_neon_8, export=1
+        ldp     w8, w9, [sp]
+        ldr     w10, [sp, #8]
+        mov     w11, #7
+        sub     w11, w11, w6
+        lsl     w7, w7, w11
+        lsl     w8, w8, w11
+        ldr     w6, [sp, #32]
+        add     w11, w9, w10
+        add     w11, w11, #1
+        lsl     w11, w11, #13
+        dup         v12.8H, w7
+        dup         v13.8H, w8
+        dup         v14.4S, w11
+        mov     x7, MAX_PB_DOUBLESIZE
+        mov     x10, x4
+        mov     x11, x0
+        mov     x12, x5
+        mov     x13, x2
+        cmp     w6, #4
+        b.eq    4f
+8:      subs    x5, x5, #1
+        ld1         {v0.1D}, [x2], x3
+        ld1         {v1.2D}, [x4], x7
+        ushll       v0.8H, v0.8B, #6
+        smull       v4.4S, v0.4H, v13.4H
+        smull2      v5.4S, v0.8H, v13.8H
+        smull       v6.4S, v1.4H, v12.4H
+        smull2      v7.4S, v1.8H, v12.8H
+        add         v4.4S, v4.4S, v6.4S
+        add         v5.4S, v5.4S, v7.4S
+        add         v4.4S, v4.4S, v14.4S
+        add         v5.4S, v5.4S, v14.4S
+        shrn        v0.4H, v4.4S, #14
+        shrn2       v0.8H, v5.4S, #14
+        sqxtun      v0.8B, v0.8H
+        st1         {v0.1D}, [x0], x1
+        b.ne    8b
+        subs    w6, w6, #8
+        b.eq    99f
+        add     x11, x11, #8
+        add     x10, x10, #16
+        add     x13, x13, #8
+        mov     x4, x10
+        mov     x0, x11
+        mov     x5, x12
+        mov     x2, x13
+        cmp     w6, #4
+        b.ne    8b
+4:      subs    x5, x5, #2
+        ld1         {v0.S}[0], [x2], x3
+        ld1         {v2.S}[0], [x2], x3
+        ld1         {v1.1D}, [x4], x7
+        ld1         {v3.1D}, [x4], x7
+        ushll       v0.8H, v0.8B, #6
+        ushll       v2.8H, v2.8B, #6
+        smull       v4.4S, v0.4H, v13.4H
+        smull       v6.4S, v1.4H, v12.4H
+        smull       v5.4S, v2.4H, v13.4H
+        smull       v7.4S, v3.4H, v12.4H
+        add         v4.4S, v4.4S, v6.4S
+        add         v4.4S, v4.4S, v14.4S
+        add         v5.4S, v5.4S, v7.4S
+        add         v5.4S, v5.4S, v14.4S
+        shrn        v0.4H, v4.4S, #14
+        shrn        v1.4H, v5.4S, #14
+        sqxtun      v0.8B, v0.8H
+        sqxtun      v1.8B, v1.8H
+        st1         {v0.S}[0], [x0], x1
+        st1         {v1.S}[0], [x0], x1
+        b.ne    4b
+99:     ret
+endfunc
-- 
2.3.2 (Apple Git-55)




From 3f13286178f46ef629184322d1b7d112cbbaba8a Mon Sep 17 00:00:00 2001
From: zjh8890 <zjh8890 at users.noreply.github.com>
Date: Wed, 27 Jan 2016 22:45:46 +0800
Subject: [PATCH 10/12] Update hevcdsp_init_neon.c to wire up the qpel NEON functions


Signed-off-by: zjh8890 <243186085 at qq.com>
---
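Notes for reviewers (not part of the commit message): the init code fills the
[my][mx] kernel tables and installs the wrappers in the generic
c->put_hevc_qpel*[idx][!!my][!!mx] slots used by the HEVC decoder, so a luma
motion vector with fractional part (mx=2, my=0) ends up in
ff_hevc_put_qpel_h2_neon_8.  A small illustrative sketch of the
naming/indexing convention (not part of the patch; [0][0], the full-pel case,
is the plain copy handled by the put_pixels kernels and the generic code):

#include <stdio.h>

/* hNvM means "horizontal filter N, vertical filter M"; the tables below are
 * indexed as [my][mx], which is why e.g. [1][2] maps to h2v1. */
static const char *qpel_kernel_name(int mx, int my)
{
    static const char *tab[4][4] = {
        { "pixels", "h1",   "h2",   "h3"   },
        { "v1",     "h1v1", "h2v1", "h3v1" },
        { "v2",     "h1v2", "h2v2", "h3v2" },
        { "v3",     "h1v3", "h2v3", "h3v3" },
    };
    return tab[my][mx];
}

int main(void)
{
    printf("%s\n", qpel_kernel_name(2, 0)); /* prints "h2" */
    return 0;
}
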
 libavcodec/aarch64/hevcdsp_init_neon.c | 202 +++++++++++++++++++++++++++++++++
 1 file changed, 202 insertions(+)


diff --git a/libavcodec/aarch64/hevcdsp_init_neon.c b/libavcodec/aarch64/hevcdsp_init_neon.c
index 0a3b2e5..050f69b 100644
--- a/libavcodec/aarch64/hevcdsp_init_neon.c
+++ b/libavcodec/aarch64/hevcdsp_init_neon.c
@@ -41,6 +41,133 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
                                       ptrdiff_t stride);
 void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
                                       ptrdiff_t stride);
+#define PUT_PIXELS(name) \
+void name(int16_t *dst, uint8_t *src, ptrdiff_t srcstride, int height, \
+          intptr_t mx, intptr_t my, int width)
+PUT_PIXELS(ff_hevc_put_pixels_w2_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w4_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w6_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w8_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w12_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w16_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w24_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8);
+#undef PUT_PIXELS
+
+static  void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                                         int height, int width);
+static  void (*put_hevc_qpel_uw_neon[4][4])(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                                         int height, int width, int16_t *src2, ptrdiff_t src2stride);
+void ff_hevc_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
+                                   int height, intptr_t mx, intptr_t my, int width);
+
+void ff_hevc_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                                   int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                                   int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+
+#define QPEL_FUNC(name) \
+    void name(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, \
+                int height, int width)
+QPEL_FUNC(ff_hevc_put_qpel_v1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_v3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1v1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1v3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2v1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2v3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3v1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3v3_neon_8);
+#undef QPEL_FUNC
+
+#define QPEL_FUNC_UW(name) \
+    void name(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, \
+            int width, int height, int16_t* src2, ptrdiff_t src2stride)
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_pixels_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v3_neon_8);
+#undef QPEL_FUNC_UW
+
+void ff_hevc_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
+                                   int height, intptr_t mx, intptr_t my, int width){
+    put_hevc_qpel_neon[my][mx](dst, MAX_PB_SIZE, src, srcstride, height, width);
+}
+
+void ff_hevc_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                                   int height, intptr_t mx, intptr_t my, int width){
+    put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, NULL, 0);
+}
+
+void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                                   int16_t *src2, int height, intptr_t mx, intptr_t my, int width){
+    put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE);
+}
+
+#define QPEL_FUNC_UW_WEIGHT(name) \
+    void name(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, \
+              int width, int height, int16_t* src2, \
+              int denom, int wx0, int wx1, int ox0, int ox1)
+QPEL_FUNC_UW_WEIGHT(ff_hevc_put_qpel_uw_weight_v1_neon_8);
+QPEL_FUNC_UW_WEIGHT(ff_hevc_put_qpel_uw_weight_v2_neon_8);
+QPEL_FUNC_UW_WEIGHT(ff_hevc_put_qpel_uw_weight_v3_neon_8);
+QPEL_FUNC_UW_WEIGHT(ff_hevc_put_qpel_uw_weight_h1_neon_8);
+QPEL_FUNC_UW_WEIGHT(ff_hevc_put_qpel_uw_weight_h2_neon_8);
+QPEL_FUNC_UW_WEIGHT(ff_hevc_put_qpel_uw_weight_h3_neon_8);
+QPEL_FUNC_UW_WEIGHT(ff_hevc_put_qpel_uw_weight_h1v1_neon_8);
+QPEL_FUNC_UW_WEIGHT(ff_hevc_put_qpel_uw_weight_h1v2_neon_8);
+QPEL_FUNC_UW_WEIGHT(ff_hevc_put_qpel_uw_weight_h1v3_neon_8);
+QPEL_FUNC_UW_WEIGHT(ff_hevc_put_qpel_uw_weight_h2v1_neon_8);
+QPEL_FUNC_UW_WEIGHT(ff_hevc_put_qpel_uw_weight_h2v2_neon_8);
+QPEL_FUNC_UW_WEIGHT(ff_hevc_put_qpel_uw_weight_h2v3_neon_8);
+QPEL_FUNC_UW_WEIGHT(ff_hevc_put_qpel_uw_weight_h3v1_neon_8);
+QPEL_FUNC_UW_WEIGHT(ff_hevc_put_qpel_uw_weight_h3v2_neon_8);
+QPEL_FUNC_UW_WEIGHT(ff_hevc_put_qpel_uw_weight_h3v3_neon_8);
+#undef QPEL_FUNC_UW_WEIGHT
+
+static void (*put_hevc_qpel_uw_weight_neon[4][4])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                                  int width, int height, int16_t* src2,
+                                                  int denom, int wx0, int wx1, int ox0, int ox1);
+
+static void ff_hevc_put_qpel_uni_weight_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                                                     int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
+{
+    put_hevc_qpel_uw_weight_neon[my][mx](dst, dststride, src, srcstride, width, height, NULL, denom, wx, 0, ox, 0);
+}
+
+static void ff_hevc_put_qpel_bi_weight_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                                                    int16_t *src2, int height, int denom, int wx0, int wx1, int ox0, int ox1,
+                                                    intptr_t mx, intptr_t my, int width)
+{
+    put_hevc_qpel_uw_weight_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, denom, wx0, wx1, ox0, ox1);
+}
+
+void ff_hevc_put_qpel_bi_w_neon_8(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *src1, ptrdiff_t _srcstride,
+                                 int16_t *src2, int height, int denom, int wx0, int wx1,
+                                 int ox0, int ox1, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_qpel_uni_w_neon_8(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *src1, ptrdiff_t _srcstride,
+                                  int height, int denom, int wx0,
+                                  int ox0, intptr_t mx, intptr_t my, int width);
 
 
 av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
@@ -60,5 +187,80 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
         c->transform_add[2]                 = ff_hevc_transform_add_16x16_neon_8;
         c->transform_add[3]                 = ff_hevc_transform_add_32x32_neon_8;
         c->idct_4x4_luma                    = ff_hevc_transform_luma_4x4_neon_8;
+
+        put_hevc_qpel_neon[1][0]            = ff_hevc_put_qpel_v1_neon_8;
+        put_hevc_qpel_neon[2][0]            = ff_hevc_put_qpel_v2_neon_8;
+        put_hevc_qpel_neon[3][0]            = ff_hevc_put_qpel_v3_neon_8;
+        put_hevc_qpel_neon[0][1]            = ff_hevc_put_qpel_h1_neon_8;
+        put_hevc_qpel_neon[0][2]            = ff_hevc_put_qpel_h2_neon_8;
+        put_hevc_qpel_neon[0][3]            = ff_hevc_put_qpel_h3_neon_8;
+        put_hevc_qpel_neon[1][1]            = ff_hevc_put_qpel_h1v1_neon_8;
+        put_hevc_qpel_neon[1][2]            = ff_hevc_put_qpel_h2v1_neon_8;
+        put_hevc_qpel_neon[1][3]            = ff_hevc_put_qpel_h3v1_neon_8;
+        put_hevc_qpel_neon[2][1]            = ff_hevc_put_qpel_h1v2_neon_8;
+        put_hevc_qpel_neon[2][2]            = ff_hevc_put_qpel_h2v2_neon_8;
+        put_hevc_qpel_neon[2][3]            = ff_hevc_put_qpel_h3v2_neon_8;
+        put_hevc_qpel_neon[3][1]            = ff_hevc_put_qpel_h1v3_neon_8;
+        put_hevc_qpel_neon[3][2]            = ff_hevc_put_qpel_h2v3_neon_8;
+        put_hevc_qpel_neon[3][3]            = ff_hevc_put_qpel_h3v3_neon_8;
+        put_hevc_qpel_uw_neon[1][0]         = ff_hevc_put_qpel_uw_v1_neon_8;
+        put_hevc_qpel_uw_neon[2][0]         = ff_hevc_put_qpel_uw_v2_neon_8;
+        put_hevc_qpel_uw_neon[3][0]         = ff_hevc_put_qpel_uw_v3_neon_8;
+        put_hevc_qpel_uw_neon[0][1]         = ff_hevc_put_qpel_uw_h1_neon_8;
+        put_hevc_qpel_uw_neon[0][2]         = ff_hevc_put_qpel_uw_h2_neon_8;
+        put_hevc_qpel_uw_neon[0][3]         = ff_hevc_put_qpel_uw_h3_neon_8;
+        put_hevc_qpel_uw_neon[1][1]         = ff_hevc_put_qpel_uw_h1v1_neon_8;
+        put_hevc_qpel_uw_neon[1][2]         = ff_hevc_put_qpel_uw_h2v1_neon_8;
+        put_hevc_qpel_uw_neon[1][3]         = ff_hevc_put_qpel_uw_h3v1_neon_8;
+        put_hevc_qpel_uw_neon[2][1]         = ff_hevc_put_qpel_uw_h1v2_neon_8;
+        put_hevc_qpel_uw_neon[2][2]         = ff_hevc_put_qpel_uw_h2v2_neon_8;
+        put_hevc_qpel_uw_neon[2][3]         = ff_hevc_put_qpel_uw_h3v2_neon_8;
+        put_hevc_qpel_uw_neon[3][1]         = ff_hevc_put_qpel_uw_h1v3_neon_8;
+        put_hevc_qpel_uw_neon[3][2]         = ff_hevc_put_qpel_uw_h2v3_neon_8;
+        put_hevc_qpel_uw_neon[3][3]         = ff_hevc_put_qpel_uw_h3v3_neon_8;
+        put_hevc_qpel_uw_weight_neon[1][0]  = ff_hevc_put_qpel_uw_weight_v1_neon_8;
+        put_hevc_qpel_uw_weight_neon[2][0]  = ff_hevc_put_qpel_uw_weight_v2_neon_8;
+        put_hevc_qpel_uw_weight_neon[3][0]  = ff_hevc_put_qpel_uw_weight_v3_neon_8;
+        put_hevc_qpel_uw_weight_neon[0][1]  = ff_hevc_put_qpel_uw_weight_h1_neon_8;
+        put_hevc_qpel_uw_weight_neon[0][2]  = ff_hevc_put_qpel_uw_weight_h2_neon_8;
+        put_hevc_qpel_uw_weight_neon[0][3]  = ff_hevc_put_qpel_uw_weight_h3_neon_8;
+        put_hevc_qpel_uw_weight_neon[1][1]  = ff_hevc_put_qpel_uw_weight_h1v1_neon_8;
+        put_hevc_qpel_uw_weight_neon[1][2]  = ff_hevc_put_qpel_uw_weight_h2v1_neon_8;
+        put_hevc_qpel_uw_weight_neon[1][3]  = ff_hevc_put_qpel_uw_weight_h3v1_neon_8;
+        put_hevc_qpel_uw_weight_neon[2][1]  = ff_hevc_put_qpel_uw_weight_h1v2_neon_8;
+        put_hevc_qpel_uw_weight_neon[2][2]  = ff_hevc_put_qpel_uw_weight_h2v2_neon_8;
+        put_hevc_qpel_uw_weight_neon[2][3]  = ff_hevc_put_qpel_uw_weight_h3v2_neon_8;
+        put_hevc_qpel_uw_weight_neon[3][1]  = ff_hevc_put_qpel_uw_weight_h1v3_neon_8;
+        put_hevc_qpel_uw_weight_neon[3][2]  = ff_hevc_put_qpel_uw_weight_h2v3_neon_8;
+        put_hevc_qpel_uw_weight_neon[3][3]  = ff_hevc_put_qpel_uw_weight_h3v3_neon_8;
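+        /* x runs over the 10 block-width classes; every case with a fractional
+         * mv component goes through the C wrappers, which dispatch on (my, mx). */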
+        for (x = 0; x < 10; x++) {
+            c->put_hevc_qpel[x][1][0]       = ff_hevc_put_qpel_neon_wrapper;
+            c->put_hevc_qpel[x][0][1]       = ff_hevc_put_qpel_neon_wrapper;
+            c->put_hevc_qpel[x][1][1]       = ff_hevc_put_qpel_neon_wrapper;
+            c->put_hevc_qpel_uni[x][1][0]   = ff_hevc_put_qpel_uni_neon_wrapper;
+            c->put_hevc_qpel_uni[x][0][1]   = ff_hevc_put_qpel_uni_neon_wrapper;
+            c->put_hevc_qpel_uni[x][1][1]   = ff_hevc_put_qpel_uni_neon_wrapper;
+            c->put_hevc_qpel_bi[x][1][0]    = ff_hevc_put_qpel_bi_neon_wrapper;
+            c->put_hevc_qpel_bi[x][0][1]    = ff_hevc_put_qpel_bi_neon_wrapper;
+            c->put_hevc_qpel_bi[x][1][1]    = ff_hevc_put_qpel_bi_neon_wrapper;
+            c->put_hevc_qpel_uni_w[x][0][0] = ff_hevc_put_qpel_uni_w_neon_8;
+            c->put_hevc_qpel_uni_w[x][1][0] = ff_hevc_put_qpel_uni_weight_neon_wrapper;
+            c->put_hevc_qpel_uni_w[x][0][1] = ff_hevc_put_qpel_uni_weight_neon_wrapper;
+            c->put_hevc_qpel_uni_w[x][1][1] = ff_hevc_put_qpel_uni_weight_neon_wrapper;
+            c->put_hevc_qpel_bi_w[x][0][0]  = ff_hevc_put_qpel_bi_w_neon_8;
+            c->put_hevc_qpel_bi_w[x][1][0]  = ff_hevc_put_qpel_bi_weight_neon_wrapper;
+            c->put_hevc_qpel_bi_w[x][0][1]  = ff_hevc_put_qpel_bi_weight_neon_wrapper;
+            c->put_hevc_qpel_bi_w[x][1][1]  = ff_hevc_put_qpel_bi_weight_neon_wrapper;
+        }
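+        /* [x][0][0]: no fractional mv component, plain full-pel copies, one
+         * assembly routine per width class (2, 4, 6, 8, 12, 16, 24, 32, 48, 64). */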
+        c->put_hevc_qpel[0][0][0]  = ff_hevc_put_pixels_w2_neon_8;
+        c->put_hevc_qpel[1][0][0]  = ff_hevc_put_pixels_w4_neon_8;
+        c->put_hevc_qpel[2][0][0]  = ff_hevc_put_pixels_w6_neon_8;
+        c->put_hevc_qpel[3][0][0]  = ff_hevc_put_pixels_w8_neon_8;
+        c->put_hevc_qpel[4][0][0]  = ff_hevc_put_pixels_w12_neon_8;
+        c->put_hevc_qpel[5][0][0]  = ff_hevc_put_pixels_w16_neon_8;
+        c->put_hevc_qpel[6][0][0]  = ff_hevc_put_pixels_w24_neon_8;
+        c->put_hevc_qpel[7][0][0]  = ff_hevc_put_pixels_w32_neon_8;
+        c->put_hevc_qpel[8][0][0]  = ff_hevc_put_pixels_w48_neon_8;
+        c->put_hevc_qpel[9][0][0]  = ff_hevc_put_pixels_w64_neon_8;
     }
 }
-- 
2.3.2 (Apple Git-55)
From db9a1abfbbb5fa9075903eb8a119b64233121f0f Mon Sep 17 00:00:00 2001
From: zjh8890 <zjh8890 at users.noreply.github.com>
Date: Wed, 27 Jan 2016 22:56:50 +0800
Subject: [PATCH 12/12] Create hevcdsp_init_aarch64.c


Signed-off-by: zjh8890 <243186085 at qq.com>
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c | 33 +++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 libavcodec/aarch64/hevcdsp_init_aarch64.c


diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
new file mode 100644
index 0000000..e8c2802
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -0,0 +1,33 @@
+/*
+ * AArch64 NEON optimised HEVC decoding
+ * Copyright (c) 2015 Junhai ZHANG <243186085 at qq.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/hevcdsp.h"
+#include "hevcdsp_aarch64.h"
+
+av_cold void ff_hevcdsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags))
+        ff_hevcdsp_init_neon(c, bit_depth);
+}
-- 
2.3.2 (Apple Git-55)
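
For context, ff_hevcdsp_init_aarch64() still needs to be called from the
generic DSP init in libavcodec/hevcdsp.c. That hunk lives in the attached
0005/0006 "Add-Aarch64-neon-optim-for-HEVC-decoder" patches, so the lines
below are only a minimal sketch of the expected wiring, mirroring the existing
per-architecture hooks; the exact placement and the ARCH_AARCH64 guard are
assumptions:

    /* libavcodec/hevcdsp.h: declaration of the new per-arch init hook */
    void ff_hevcdsp_init_aarch64(HEVCDSPContext *c, const int bit_depth);

    /* libavcodec/hevcdsp.c, end of ff_hevc_dsp_init(): sketch only,
     * assuming the usual ARCH_* compile-time guard */
    if (ARCH_AARCH64)
        ff_hevcdsp_init_aarch64(hevcdsp, bit_depth);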
-------------- next part --------------
[The list archive scrubbed the following non-text (application/octet-stream) attachments from this message:]
 - 0002-Create-hevcdsp_init_aarch64.c.patch, 1928 bytes:
   <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20160128/0950c09e/attachment.obj>
 - 0003-Create-hevcdsp_init_neon.c.patch, 3981 bytes:
   <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20160128/0950c09e/attachment-0001.obj>
 - 0004-Update-Makefile-to-add-Aach64-NEON-optim-HEVC-decode.patch, 1571 bytes:
   <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20160128/0950c09e/attachment-0002.obj>
 - 0005-Add-Aarch64-neon-optim-for-HEVC-decoder.patch, 823 bytes:
   <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20160128/0950c09e/attachment-0003.obj>
 - 0006-Add-Aarch64-neon-optim-for-HEVC-decoder.patch, 979 bytes:
   <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20160128/0950c09e/attachment-0004.obj>
 - 0007-Create-hevcdsp_aarch64.h.patch, 1792 bytes:
   <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20160128/0950c09e/attachment-0005.obj>
 - 0008-Add-qpel-neon-optimization-for-HEVC-decoder.patch, 1044 bytes:
   <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20160128/0950c09e/attachment-0006.obj>
 - 0012-Create-hevcdsp_init_aarch64.c.patch, 1910 bytes:
   <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20160128/0950c09e/attachment-0007.obj>
 - 0009-Create-hevcdsp_qpel_neon.S.patch, 43767 bytes:
   <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20160128/0950c09e/attachment-0008.obj>
 - 0010-Update-hevcdsp_init_neon.c.patch, 14286 bytes:
   <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20160128/0950c09e/attachment-0009.obj>
 - 0001-Create-hevcdsp_idct_neon.S-for-aarch64.patch, 48246 bytes:
   <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20160128/0950c09e/attachment-0010.obj>

