[FFmpeg-devel] [PATCH 2/3] simple_idct12: align C and x86

Christophe Gisquet christophe.gisquet at gmail.com
Tue Oct 13 21:21:40 CEST 2015


Results for omse on the 3 idct dct-tests:

C:   0.16915859   0.11848359   0.12913125
x86: 0.16883281   0.11849063   0.19041875

Using 14 and 17 as shifts substantially improves those, but actually
causes overflows and incorrect decoding of 12bpp content.
---
 libavcodec/simple_idct_template.c | 17 ++++-------------
 libavcodec/x86/idctdsp_init.c     |  8 +++-----
 libavcodec/x86/simple_idct10.asm  |  7 +++----
 3 files changed, 10 insertions(+), 22 deletions(-)

diff --git a/libavcodec/simple_idct_template.c b/libavcodec/simple_idct_template.c
index 0585679..c94c583 100644
--- a/libavcodec/simple_idct_template.c
+++ b/libavcodec/simple_idct_template.c
@@ -66,7 +66,6 @@
 
 #elif BIT_DEPTH == 10 || BIT_DEPTH == 12
 
-# if BIT_DEPTH == 10
 #define W1 22725 // 90901
 #define W2 21407 //  85627
 #define W3 19265 //  77062
@@ -75,6 +74,7 @@
 #define W6  8867 //  35468
 #define W7  4520 //  18081
 
+# if BIT_DEPTH == 10
 #   ifdef EXTRA_SHIFT
 #define ROW_SHIFT 13
 #define COL_SHIFT 18
@@ -84,19 +84,10 @@
 #define COL_SHIFT 19
 #define DC_SHIFT  2
 #   endif
-
 # else
-#define W1 45451
-#define W2 42813
-#define W3 38531
-#define W4 32767
-#define W5 25746
-#define W6 17734
-#define W7 9041
-
-#define ROW_SHIFT 16
-#define COL_SHIFT 17
-#define DC_SHIFT -1
+#define ROW_SHIFT 15
+#define COL_SHIFT 16
+#define DC_SHIFT  -1
 # endif
 
 #define MUL(a, b)    ((a) * (b))
diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index bcf7e5b..8b25ff9 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -86,11 +86,11 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
         c->add_pixels_clamped        = ff_add_pixels_clamped_sse2;
     }
 
-    if (ARCH_X86_64 && avctx->lowres == 0) {
-        if (avctx->bits_per_raw_sample == 10 &&
+    if (ARCH_X86_64 && avctx->lowres == 0 &&
         (avctx->idct_algo == FF_IDCT_AUTO ||
          avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
          avctx->idct_algo == FF_IDCT_SIMPLE)) {
+        if (avctx->bits_per_raw_sample == 10) {
         if (EXTERNAL_SSE2(cpu_flags)) {
             c->idct_put  = ff_simple_idct10_put_sse2;
             c->idct_add  = NULL;
@@ -106,9 +106,7 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
         }
         }
 
-        if (avctx->bits_per_raw_sample == 12 &&
-            (avctx->idct_algo == FF_IDCT_AUTO ||
-             avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
+        if (avctx->bits_per_raw_sample == 12) {
             if (EXTERNAL_SSE2(cpu_flags)) {
                 c->idct_put  = ff_simple_idct12_put_sse2;
                 c->idct_add  = NULL;
diff --git a/libavcodec/x86/simple_idct10.asm b/libavcodec/x86/simple_idct10.asm
index cd83d61..c5ee05c 100644
--- a/libavcodec/x86/simple_idct10.asm
+++ b/libavcodec/x86/simple_idct10.asm
@@ -29,12 +29,11 @@
 
 SECTION_RODATA
 
-cextern pw_2
-cextern pw_16
 cextern pw_1023
 cextern pw_4095
 pd_round_12: times 4 dd 1<<(12-1)
 pd_round_15: times 4 dd 1<<(15-1)
+pd_round_16: times 4 dd 1<<(16-1)
 pd_round_19: times 4 dd 1<<(19-1)
 
 %macro CONST_DEC  3
@@ -79,14 +78,14 @@ cglobal simple_idct10_put, 3, 3, 16
 cglobal simple_idct12, 1, 1, 16
     ; coeffs are already 15bits, adding the offset would cause
     ; overflow in the input
-    IDCT_FN    "", 15, pw_2, 16
+    IDCT_FN    "", 15, "", 16
     RET
 
 cglobal simple_idct12_put, 3, 3, 16
     ; range isn't known, so the C simple_idct range is used
     ; Also, using a bias on input overflows, so use the bias
     ; on output of the first butterfly instead
-    IDCT_FN    "", 15, pw_2, 16, 0, pw_4095
+    IDCT_FN    "", 15, "", 16, 0, pw_4095
     RET
 %endmacro
 
-- 
2.6.0



More information about the ffmpeg-devel mailing list