[FFmpeg-devel] [PATCH 3/3] h264: new assembly version of get_cabac for x86_64 with PIC

Roland Scheidegger rscheidegger_lists at hispeed.ch
Fri Apr 27 03:45:26 CEST 2012


This adds a hand-optimized assembly version for get_cabac much like the
existing one, but it works if the table offsets are RIP-relative.
Compared to the non-RIP-relative version this adds 2 lea instructions
and it needs one extra register.
There is a surprisingly large performance improvement over the C version (more
than the generated assembly seems to suggest) in get_cabac itself: I measured
roughly 40% faster for get_cabac on a K8. Overall, however, the difference is
not that big: I measured roughly 5% on a test clip on a K8 and a Core2.
Hopefully it still compiles on 32-bit x86...
Now that only one table is used, there's some chance that even Darwin's
assembler (as) compiles this. Apparently the label arithmetic used previously
doesn't work if it involves symbols defined in a different file; thanks to
Ronald S. Bultje for helping me with this.
---
 libavcodec/h264_cabac.c    |    2 +-
 libavcodec/x86/cabac.h     |   88 ++++++++++++++++++++++++++++++++++++++++----
 libavcodec/x86/h264_i386.h |   53 ++++++++++++++++++--------
 3 files changed, 118 insertions(+), 25 deletions(-)

diff --git a/libavcodec/h264_cabac.c b/libavcodec/h264_cabac.c
index 29dbd7a..d85ebf5 100644
--- a/libavcodec/h264_cabac.c
+++ b/libavcodec/h264_cabac.c
@@ -1652,7 +1652,7 @@ decode_cabac_residual_internal(H264Context *h, DCTELEM *block,
             index[coeff_count++] = last;\
         }
         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
-#if ARCH_X86 && HAVE_7REGS && !defined(BROKEN_RELOCATIONS)
+#if ARCH_X86 && HAVE_7REGS
         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index,
                                                  last_coeff_ctx_base, sig_off);
     } else {
diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index e112f67..ca42eae 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -27,6 +27,68 @@
 #include "libavutil/internal.h"
 #include "config.h"
 
+#ifdef BROKEN_RELOCATIONS
+#define TABLES_ARG , "r"(tables)
+
+#if HAVE_FAST_CMOV
+#define BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
+        "cmp    "low"       , "tmp"                        \n\t"\
+        "cmova  %%ecx       , "range"                      \n\t"\
+        "sbb    %%rcx       , %%rcx                        \n\t"\
+        "and    %%ecx       , "tmp"                        \n\t"\
+        "xor    %%rcx       , "retq"                       \n\t"\
+        "sub    "tmp"       , "low"                        \n\t"
+#else /* HAVE_FAST_CMOV */
+#define BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
+/* P4 Prescott has crappy cmov,sbb,64bit shift so avoid them */ \
+        "sub    "low"       , "tmp"                        \n\t"\
+        "sar    $31         , "tmp"                        \n\t"\
+        "sub    %%ecx       , "range"                      \n\t"\
+        "and    "tmp"       , "range"                      \n\t"\
+        "add    %%ecx       , "range"                      \n\t"\
+        "shl    $17         , %%ecx                        \n\t"\
+        "and    "tmp"       , %%ecx                        \n\t"\
+        "sub    %%ecx       , "low"                        \n\t"\
+        "xor    "tmp"       , "ret"                        \n\t"\
+        "movslq "ret"       , "retq"                       \n\t"
+#endif /* HAVE_FAST_CMOV */
+
+#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off, tables) \
+        "movzbl "statep"    , "ret"                                     \n\t"\
+        "mov    "range"     , "tmp"                                     \n\t"\
+        "and    $0xC0       , "range"                                   \n\t"\
+        "lea    ("ret", "range", 2), %%ecx                              \n\t"\
+        "movzbl "lps_off"("tables", %%rcx), "range"                     \n\t"\
+        "sub    "range"     , "tmp"                                     \n\t"\
+        "mov    "tmp"       , %%ecx                                     \n\t"\
+        "shl    $17         , "tmp"                                     \n\t"\
+        BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp)              \
+        "movzbl "norm_off"("tables", "rangeq"), %%ecx                   \n\t"\
+        "shl    %%cl        , "range"                                   \n\t"\
+        "movzbl "mlps_off"+128("tables", "retq"), "tmp"                 \n\t"\
+        "shl    %%cl        , "low"                                     \n\t"\
+        "mov    "tmpbyte"   , "statep"                                  \n\t"\
+        "test   "lowword"   , "lowword"                                 \n\t"\
+        "jnz    2f                                                      \n\t"\
+        "mov    "byte"      , %%"REG_c"                                 \n\t"\
+        "add"OPSIZE" $2     , "byte"                                    \n\t"\
+        "movzwl (%%"REG_c") , "tmp"                                     \n\t"\
+        "lea    -1("low")   , %%ecx                                     \n\t"\
+        "xor    "low"       , %%ecx                                     \n\t"\
+        "shr    $15         , %%ecx                                     \n\t"\
+        "bswap  "tmp"                                                   \n\t"\
+        "shr    $15         , "tmp"                                     \n\t"\
+        "movzbl "norm_off"("tables", %%rcx), %%ecx                      \n\t"\
+        "sub    $0xFFFF     , "tmp"                                     \n\t"\
+        "neg    %%ecx                                                   \n\t"\
+        "add    $7          , %%ecx                                     \n\t"\
+        "shl    %%cl        , "tmp"                                     \n\t"\
+        "add    "tmp"       , "low"                                     \n\t"\
+        "2:                                                             \n\t"
+
+#else /* BROKEN_RELOCATIONS */
+#define TABLES_ARG /* expands to nothing: no table-pointer operand is appended to the asm input lists */
+
 #if HAVE_FAST_CMOV
 #define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
         "mov    "tmp"       , %%ecx     \n\t"\
@@ -52,7 +114,7 @@
         "xor    "tmp"       , "ret"     \n\t"
 #endif /* HAVE_FAST_CMOV */
 
-#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off) \
+#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off, tables) \
         "movzbl "statep"    , "ret"                                     \n\t"\
         "mov    "range"     , "tmp"                                     \n\t"\
         "and    $0xC0       , "range"                                   \n\t"\
@@ -82,31 +144,41 @@
         "add    "tmp"       , "low"                                     \n\t"\
         "2:                                                             \n\t"
 
+#endif /* BROKEN_RELOCATIONS */
 
-#if HAVE_7REGS && !defined(BROKEN_RELOCATIONS) && !(defined(__i386) && defined(__clang__) && (__clang_major__<2 || (__clang_major__==2 && __clang_minor__<10)))\
-                                               && !(defined(__i386) && !defined(__clang__) && defined(__llvm__) && __GNUC__==4 && __GNUC_MINOR__==2 && __GNUC_PATCHLEVEL__<=1)
+
+#if HAVE_7REGS && !(defined(__i386) && defined(__clang__) && (__clang_major__<2 || (__clang_major__==2 && __clang_minor__<10)))\
+               && !(defined(__i386) && !defined(__clang__) && defined(__llvm__) && __GNUC__==4 && __GNUC_MINOR__==2 && __GNUC_PATCHLEVEL__<=1)
 #define get_cabac_inline get_cabac_inline_x86
 static av_always_inline int get_cabac_inline_x86(CABACContext *c,
                                                  uint8_t *const state)
 {
     int bit, tmp;
+#ifdef BROKEN_RELOCATIONS
+    void *tables;
+
+    __asm__ volatile(
+        "lea    "MANGLE(ff_h264_cabac_tables)", %0      \n\t"
+        : "=&r"(tables)
+    );
+#endif
 
     __asm__ volatile(
-        BRANCHLESS_GET_CABAC("%0", "(%4)", "%1", "%w1",
-                             "%2", "%3", "%b3",
-                             "%a6(%5)", "%a7(%5)", "%a8", "%a9", "%a10")
+        BRANCHLESS_GET_CABAC("%0", "%q0", "(%4)", "%1", "%w1",
+                             "%2", "%q2", "%3", "%b3",
+                             "%a6(%5)", "%a7(%5)", "%a8", "%a9", "%a10", "%11")
         : "=&r"(bit), "+&r"(c->low), "+&r"(c->range), "=&q"(tmp)
         : "r"(state), "r"(c),
           "i"(offsetof(CABACContext, bytestream)),
           "i"(offsetof(CABACContext, bytestream_end)),
           "i"(H264_NORM_SHIFT_OFFSET),
           "i"(H264_LPS_RANGE_OFFSET),
-          "i"(H264_MLPS_STATE_OFFSET)
+          "i"(H264_MLPS_STATE_OFFSET) TABLES_ARG
         : "%"REG_c, "memory"
     );
     return bit & 1;
 }
-#endif /* HAVE_7REGS && !defined(BROKEN_RELOCATIONS) */
+#endif /* HAVE_7REGS */
 
 #define get_cabac_bypass_sign get_cabac_bypass_sign_x86
 static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
diff --git a/libavcodec/x86/h264_i386.h b/libavcodec/x86/h264_i386.h
index d278708..2a502b7 100644
--- a/libavcodec/x86/h264_i386.h
+++ b/libavcodec/x86/h264_i386.h
@@ -36,7 +36,7 @@
 
 //FIXME use some macros to avoid duplicating get_cabac (cannot be done yet
 //as that would make optimization work hard)
-#if HAVE_7REGS && !defined(BROKEN_RELOCATIONS)
+#if HAVE_7REGS
 static int decode_significance_x86(CABACContext *c, int max_coeff,
                                    uint8_t *significant_coeff_ctx_base,
                                    int *index, x86_reg last_off){
@@ -46,20 +46,29 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
     int bit;
     x86_reg coeff_count;
 
+#ifdef BROKEN_RELOCATIONS
+    void *tables;
+
+    __asm__ volatile(
+        "lea   "MANGLE(ff_h264_cabac_tables)", %0      \n\t"
+        : "=&r"(tables)
+    );
+#endif
+
     __asm__ volatile(
         "3:                                     \n\t"
 
-        BRANCHLESS_GET_CABAC("%4", "(%1)", "%3", "%w3",
-                             "%5", "%k0", "%b0",
-                             "%a11(%6)", "%a12(%6)", "%a13", "%a14", "%a15")
+        BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3",
+                             "%5", "%q5", "%k0", "%b0",
+                             "%a11(%6)", "%a12(%6)", "%a13", "%a14", "%a15", "%16")
 
         "test $1, %4                            \n\t"
         " jz 4f                                 \n\t"
         "add  %10, %1                           \n\t"
 
-        BRANCHLESS_GET_CABAC("%4", "(%1)", "%3", "%w3",
-                             "%5", "%k0", "%b0",
-                             "%a11(%6)", "%a12(%6)", "%a13", "%a14", "%a15")
+        BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3",
+                             "%5", "%q5", "%k0", "%b0",
+                             "%a11(%6)", "%a12(%6)", "%a13", "%a14", "%a15", "%16")
 
         "sub  %10, %1                           \n\t"
         "mov  %2, %0                            \n\t"
@@ -90,7 +99,7 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
           "i"(offsetof(CABACContext, bytestream_end)),
           "i"(H264_NORM_SHIFT_OFFSET),
           "i"(H264_LPS_RANGE_OFFSET),
-          "i"(H264_MLPS_STATE_OFFSET)
+          "i"(H264_MLPS_STATE_OFFSET) TABLES_ARG
         : "%"REG_c, "memory"
     );
     return coeff_count;
@@ -105,6 +114,15 @@ static int decode_significance_8x8_x86(CABACContext *c,
     x86_reg last=0;
     x86_reg state;
 
+#ifdef BROKEN_RELOCATIONS
+    void *tables;
+
+    __asm__ volatile(
+        "lea    "MANGLE(ff_h264_cabac_tables)", %0      \n\t"
+        : "=&r"(tables)
+    );
+#endif
+
     __asm__ volatile(
         "mov %1, %6                             \n\t"
         "3:                                     \n\t"
@@ -113,21 +131,24 @@ static int decode_significance_8x8_x86(CABACContext *c,
         "movzbl (%0, %6), %k6                   \n\t"
         "add %9, %6                             \n\t"
 
-        BRANCHLESS_GET_CABAC("%4", "(%6)", "%3", "%w3",
-                             "%5", "%k0", "%b0",
-                             "%a12(%7)", "%a13(%7)", "%a14", "%a15", "%a16")
+        BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
+                             "%5", "%q5", "%k0", "%b0",
+                             "%a12(%7)", "%a13(%7)", "%a14", "%a15", "%a16", "%18")
 
         "mov %1, %k6                            \n\t"
         "test $1, %4                            \n\t"
         " jz 4f                                 \n\t"
 
+#ifdef BROKEN_RELOCATIONS
+        "movzbl %a17(%18, %q6), %k6\n\t"
+#else
         "movzbl "MANGLE(ff_h264_cabac_tables)"+%a17(%k6), %k6\n\t"
-
+#endif
         "add %11, %6                            \n\t"
 
-        BRANCHLESS_GET_CABAC("%4", "(%6)", "%3", "%w3",
-                             "%5", "%k0", "%b0",
-                             "%a12(%7)", "%a13(%7)", "%a14", "%a15", "%a16")
+        BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
+                             "%5", "%q5", "%k0", "%b0",
+                             "%a12(%7)", "%a13(%7)", "%a14", "%a15", "%a16", "%18")
 
         "mov %2, %0                             \n\t"
         "mov %1, %k6                            \n\t"
@@ -157,7 +178,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
           "i"(H264_NORM_SHIFT_OFFSET),
           "i"(H264_LPS_RANGE_OFFSET),
           "i"(H264_MLPS_STATE_OFFSET),
-          "i"(H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET)
+          "i"(H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET) TABLES_ARG
         : "%"REG_c, "memory"
     );
     return coeff_count;
-- 
1.7.1



More information about the ffmpeg-devel mailing list