[Ffmpeg-devel] [PATCH] lowres chroma bug

Trent Piepho xyzzy
Wed Feb 7 07:46:41 CET 2007


On Wed, 7 Feb 2007, Oleg Metelitsa wrote:
> MN> no because it slows the non lowres code down
>
> Please find a patch below and a short explanation why the "lowres
> chroma bug" appears.
>
>              H264_CHROMA_OP4(%0, %%mm1, %%mm3)
> -            "movd %%mm1, %0\n\t"
> -            : "=m" (dst[0]) : "m" (ff_pw_32));
> +            "movd %%mm1, %%edx\n\t"
> +            "mov %%dx, %0\n\t"
> +            : "=m" (dst[0]) : "m" (ff_pw_32): "%edx");
>          dst += stride;
>      }

Instead of hardcoding the scratch register, it's better to let gcc choose a
register.  Like this:

--- i386/dsputil_h264_template_mmx.c	(revision 7867)
+++ i386/dsputil_h264_template_mmx.c	(working copy)
@@ -310,15 +310,16 @@

+        { uint32_t scratch;  /* for asm scratch register */
         asm volatile(
             /* dst[0,1] = pack((mm1 + 32) >> 6) */
-            "paddw %1, %%mm1\n\t"
+            "paddw %2, %%mm1\n\t"
             "psrlw $6, %%mm1\n\t"
             "packssdw %%mm7, %%mm1\n\t"
             "packuswb %%mm7, %%mm1\n\t"
             /* writes garbage to the right of dst.
              * ok because partitions are processed from left to right. */
             H264_CHROMA_OP4(%0, %%mm1, %%mm3)
-            "movd %%mm1, %0\n\t"
-            : "=m" (dst[0]) : "m" (ff_pw_32));
+            "movd %%mm1, %1\n\t"
+            "movw %w1, %0\n\t"
+            : "=m" (dst[0]), "=q" (scratch) : "m" (ff_pw_32));
+        }
         dst += stride;
     }
 }

> As  a  result, a 16-bit word is written into the memory + a "garbage"
> 16-bit  word.  The rows in the image are processed from left to right.
> It  was  expected  that  the  garbage  will  be rewritten in the next
> writing circle.

It seems like this will only work for put_h264_chroma_mc2_mmx2(), for
avg_h264_chroma_mc2_mmx2() it doesn't work.  When the garbage word is
written on one call, the next call will average with that garbage data.

This can be fixed, without changing the put version, by adding two mmx
instructions to the average op:

--- i386/h264dsp_mmx.c  (revision 7867)
+++ i386/h264dsp_mmx.c  (working copy)
@@ -1388,7 +1388,9 @@
 #define H264_CHROMA_OP4(S,D,T) "movd  " #S ", " #T " \n\t"\
-                               "pavgb " #T ", " #D " \n\t"
+                               "pavgb " #T ", " #D " \n\t"\
+                               "psrld $16, " #T "\n\t"\
+                               "punpcklwd " #T ", " #D "\n\t"

Of course that would only be done for avg_h264_chroma_mc2_mmx2, not for
avg_h264_chroma_mc{4,8}_mmx2.  Maybe this is faster than the using the
16-bit move?  The same can be done for the put version too:

@@ -1376,1 +1376,2 @@
-#define H264_CHROMA_OP4(S,D,T)
+#define H264_CHROMA_OP4(S,D,T) "movd 2+" #S ", " #T "\n\t"\
+                               "punpcklwd " #T ", " #D "\n\t"

I tried benchmarking with mplayer and -nosound -vo null -benchmark -quiet,
but the difference was too small to be statistically significant compared
to the sampling error.

I put a rdtsc counter around the {avg,put}_h264_chroma_mc2_mmx2()
functions, and that showed that the version that uses mmx code was 4.9%
slower and the version that does a movd to a register and then a word move
was 13.8% slower than the original (buggy) version.

Since the word move version uses another register, the rdtsc code might
have made a bigger difference in it's speed than the other versions.

I've attached a patch that has both versions with a #if to select between
them.
-------------- next part --------------
Index: i386/dsputil_h264_template_mmx.c
===================================================================
--- i386/dsputil_h264_template_mmx.c	(revision 7867)
+++ i386/dsputil_h264_template_mmx.c	(working copy)
@@ -310,15 +310,28 @@
 
         asm volatile(
             /* dst[0,1] = pack((mm1 + 32) >> 6) */
-            "paddw %1, %%mm1\n\t"
+            "paddw %0, %%mm1\n\t"
             "psrlw $6, %%mm1\n\t"
             "packssdw %%mm7, %%mm1\n\t"
             "packuswb %%mm7, %%mm1\n\t"
+            :: "m" (ff_pw_32));
+
+#if 1  // 0 for movd/movw version, 1 for mmx version
+        asm volatile(
+            H264_CHROMA_OP2(%0, %%mm1, %%mm3)
+            "movd %%mm1, %0\n\t"
+            : "=m" (dst[0]));
+#else
+        { uint32_t scratch;  /* for asm scratch register */
+        asm volatile(
             /* writes garbage to the right of dst.
              * ok because partitions are processed from left to right. */
             H264_CHROMA_OP4(%0, %%mm1, %%mm3)
-            "movd %%mm1, %0\n\t"
-            : "=m" (dst[0]) : "m" (ff_pw_32));
+            "movd %%mm1, %1\n\t"
+            "movw %w1, %0\n\t"
+            : "=m" (dst[0]), "=q" (scratch));
+        }
+#endif
         dst += stride;
     }
 }
Index: i386/h264dsp_mmx.c
===================================================================
--- i386/h264dsp_mmx.c	(revision 7867)
+++ i386/h264dsp_mmx.c	(working copy)
@@ -1374,6 +1374,8 @@
 
 #define H264_CHROMA_OP(S,D)
 #define H264_CHROMA_OP4(S,D,T)
+#define H264_CHROMA_OP2(S,D,T) "movd 2+" #S ", " #T "\n\t"\
+                               "punpcklwd " #T ", " #D "\n\t"
 #define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_mmx
 #define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_mmx
 #define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2
@@ -1381,6 +1383,7 @@
 #include "dsputil_h264_template_mmx.c"
 #undef H264_CHROMA_OP
 #undef H264_CHROMA_OP4
+#undef H264_CHROMA_OP2
 #undef H264_CHROMA_MC8_TMPL
 #undef H264_CHROMA_MC4_TMPL
 #undef H264_CHROMA_MC2_TMPL
@@ -1389,6 +1392,10 @@
 #define H264_CHROMA_OP(S,D) "pavgb " #S ", " #D " \n\t"
 #define H264_CHROMA_OP4(S,D,T) "movd  " #S ", " #T " \n\t"\
                                "pavgb " #T ", " #D " \n\t"
+#define H264_CHROMA_OP2(S,D,T) "movd  " #S ", " #T " \n\t"\
+                               "pavgb " #T ", " #D " \n\t"\
+                               "psrld $16, " #T "\n\t"\
+                               "punpcklwd " #T ", " #D "\n\t"
 #define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_mmx2
 #define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_mmx2
 #define H264_CHROMA_MC2_TMPL avg_h264_chroma_mc2_mmx2
@@ -1396,6 +1403,7 @@
 #include "dsputil_h264_template_mmx.c"
 #undef H264_CHROMA_OP
 #undef H264_CHROMA_OP4
+#undef H264_CHROMA_OP2
 #undef H264_CHROMA_MC8_TMPL
 #undef H264_CHROMA_MC4_TMPL
 #undef H264_CHROMA_MC2_TMPL



More information about the ffmpeg-devel mailing list