[FFmpeg-devel] [PATCH] h264: assembly version of get_cabac for x86_64 with PIC

Roland Scheidegger rscheidegger_lists at hispeed.ch
Fri Apr 13 16:34:34 CEST 2012


This adds a hand-optimized assembly version for get_cabac much like the
existing one, but it works if the table offsets are RIP-relative.
Compared to the non-RIP-relative version this adds 2 lea instructions
and it needs one extra register.
Since x86_64 CPUs always support cmov, this version always uses it (I don't
care if you have a P4 Prescott whose cmov implementation is useless).
There is a surprisingly large performance improvement over the C version (more
so than the generated assembly seems to suggest) just in get_cabac: I measured
it to be roughly 40% faster for get_cabac on a K8.
There are similar functions which could get the same treatment, but they
are less frequently used, and since this approach isn't very nice (we can't
use the same assembly template), focus on this function alone for now.
v2: incorporated feedback from Loren Merritt to avoid rip-relative movs
for every table, and got rid of unnecessary @GOTPCREL.
---
 libavcodec/x86/cabac.h |   65 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 65 insertions(+), 0 deletions(-)

diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 0c4419b..4f275f9 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -24,8 +24,72 @@
 #include "libavcodec/cabac.h"
 #include "libavutil/attributes.h"
 #include "libavutil/x86_cpu.h"
+#include "libavutil/internal.h"
 #include "config.h"
 
+#if defined(BROKEN_RELOCATIONS)
+#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, tmp2q, byte, end) /* asm template; tables are addressed relative to label 1, %rcx is scratch, 'end' is not referenced in the body */ \
+        "1:                                                             \n\t" /* anchor label used as base for all table accesses */\
+        "movzbl "statep"    , "ret"                                     \n\t" /* ret = zero-extended state byte */\
+        "mov    "range"     , "tmp"                                     \n\t" /* tmp = copy of range */\
+        "and    $0xC0       , "range"                                   \n\t" /* keep only bits 6-7 of range */\
+        "lea    1b(%%rip)   , "tmp2q"                                   \n\t" /* tmp2q = run-time address of label 1 (RIP-relative) */\
+        "lea    ("ret", "range", 2), %%ecx                              \n\t" /* ecx = ret + 2*range */\
+        "movzbl ff_h264_lps_range-1b("tmp2q", %%rcx), "range"           \n\t" /* range = byte at ff_h264_lps_range + rcx */\
+        "sub    "range"     , "tmp"                                     \n\t" /* tmp -= range */\
+        "mov    "tmp"       , %%ecx                                     \n\t" /* ecx = unshifted tmp, kept for the cmova below */\
+        "shl    $17         , "tmp"                                     \n\t" /* tmp <<= 17 */\
+        "cmp    "low"       , "tmp"                                     \n\t" /* flags from tmp - low */\
+        "cmova  %%ecx       , "range"                                   \n\t" /* if tmp > low (unsigned): range = saved ecx */\
+        "sbb    %%rcx       , %%rcx                                     \n\t" /* rcx = all ones if tmp < low (carry set), else 0 */\
+        "and    %%ecx       , "tmp"                                     \n\t" /* tmp &= mask */\
+        "xor    %%rcx       , "retq"                                    \n\t" /* bitwise-invert retq when the mask is all ones */\
+        "sub    "tmp"       , "low"                                     \n\t" /* low -= masked tmp */\
+        "movzbl ff_h264_norm_shift-1b("tmp2q", "rangeq"), %%ecx         \n\t" /* ecx = byte at ff_h264_norm_shift + range */\
+        "shl    %%cl        , "range"                                   \n\t" /* range <<= shift */\
+        "movzbl ff_h264_mlps_state-1b+128("tmp2q", "retq"), "tmp"       \n\t" /* tmp = byte at ff_h264_mlps_state + 128 + retq */\
+        "shl    %%cl        , "low"                                     \n\t" /* low <<= same shift */\
+        "mov    "tmpbyte"   , "statep"                                  \n\t" /* store the looked-up byte back to the state */\
+        "test   "lowword"   , "lowword"                                 \n\t" /* are the low 16 bits of low zero? */\
+        " jnz   2f                                                      \n\t" /* non-zero: skip the refill below */\
+        "mov    "byte"      , %%"REG_c"                                 \n\t" /* REG_c = pointer stored in the 'byte' operand */\
+        "add"OPSIZE" $2     , "byte"                                    \n\t" /* advance the stored pointer by 2 */\
+        "movzwl (%%"REG_c") , "tmp"                                     \n\t" /* tmp = 16 bits read through the old pointer */\
+        "lea    -1("low")   , %%ecx                                     \n\t" /* ecx = low - 1 */\
+        "xor    "low"       , %%ecx                                     \n\t" /* ecx = low ^ (low - 1) */\
+        "shr    $15         , %%ecx                                     \n\t" /* ecx >>= 15 */\
+        "bswap  "tmp"                                                   \n\t" /* byte-swap: the two loaded bytes end up in the top half */\
+        "shr    $15         , "tmp"                                     \n\t" /* tmp = byte-swapped 16-bit value << 1 */\
+        "movzbl ff_h264_norm_shift-1b("tmp2q", %%rcx), %%ecx            \n\t" /* ecx = byte at ff_h264_norm_shift + rcx */\
+        "sub    $0xFFFF     , "tmp"                                     \n\t" /* tmp -= 0xFFFF */\
+        "neg    %%ecx                                                   \n\t" /* with the add below: ecx = 7 - ecx */\
+        "add    $7          , %%ecx                                     \n\t" /* ecx = 7 - table value */\
+        "shl    %%cl        , "tmp"                                     \n\t" /* tmp <<= ecx */\
+        "add    "tmp"       , "low"                                     \n\t" /* low += tmp */\
+        "2:                                                             \n\t" /* end of template; jnz target when no refill is needed */
+
+#define get_cabac_inline get_cabac_inline_x86 /* alias get_cabac_inline to the asm version defined below */
+static av_always_inline int get_cabac_inline_x86(CABACContext *c, /* c: context; low, range and the bytestream pointer are updated */
+                                                 uint8_t *const state) /* state: byte that is read and rewritten by the asm */
+{
+    int bit, tmp, tmp2; /* NOTE(review): tmp2 is an int but is bound via %q4 as a 64-bit register holding an address - confirm this is intended */
+
+    __asm__ volatile(
+        BRANCHLESS_GET_CABAC("%0", "%q0", "(%5)", "%1", "%w1", /* ret, retq = bit; statep = *state; low, lowword = c->low */
+                             "%2", "%q2", "%3", "%b3", "%q4", /* range, rangeq = c->range; tmp, tmpbyte = tmp; tmp2q = tmp2 */
+                             "%a7(%6)", "%a8(%6)") /* byte = c->bytestream, end = c->bytestream_end, both as offset(c) */
+        : "=&r"(bit), "+&r"(c->low), "+&r"(c->range), "=&q"(tmp), "=&r"(tmp2) /* operands 0-4; "q" for tmp because its byte sub-register %b3 is used */
+        : "r"(state), "r"(c), /* operands 5 and 6 */
+          "i"(offsetof(CABACContext, bytestream)), /* operand 7: immediate member offset */
+          "i"(offsetof(CABACContext, bytestream_end)) /* operand 8: immediate member offset */
+        : "%"REG_c, "memory" /* the template uses the c register as scratch and writes to memory */
+    );
+    return bit & 1; /* only the lowest bit of the asm result is returned */
+}
+
+
+#else
+
 #if HAVE_FAST_CMOV
 #define BRANCHLESS_GET_CABAC_UPDATE(ret, statep, low, range, tmp)\
         "mov    "tmp"       , %%ecx     \n\t"\
@@ -103,6 +167,7 @@ static av_always_inline int get_cabac_inline_x86(CABACContext *c,
     return bit & 1;
 }
 #endif /* HAVE_7REGS && !defined(BROKEN_RELOCATIONS) */
+#endif
 
 #define get_cabac_bypass_sign get_cabac_bypass_sign_x86
 static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
-- 
1.7.1



More information about the ffmpeg-devel mailing list