[Ffmpeg-devel] [PATCH] fix mpeg4 lowres chroma bug and increase h264/mpeg4 MC speed

Trent Piepho xyzzy
Fri Feb 9 09:03:47 CET 2007


On Thu, 8 Feb 2007, Michael Niedermayer wrote:
> On Thu, Feb 08, 2007 at 02:48:53AM -0800, Trent Piepho wrote:
> [...]
> > Anyway, there is an obvious way to make it faster that we both missed the
> > first time:
> >
> > #define H264_CHROMA_OP2(S,D,T) "punpcklwd 2+" #S ", " #D "\n\t"
> >
> > This is about 4.38% faster than the my first patch, and 17.4% faster than
> > the original code.
>
> but slower then what is in svn which is what matters (it slows h.264 down)

No, it's not slower, it's faster.  I calculated that last percentage
incorrectly, it should be 15.4%.  That means, it is 15.4% faster than what
is in svn now, in addition to working correctly, which the svn code does
not.

Do you disagree with me that avg_h264_chroma_mc4_mmx2 is completely broken?

put_h264_chroma_mc4_mmx2() can work in for h264, but you clobber random
memory after the end of the image.  If you increase the stride and pad the
end of the image, you use more memory (cache effects, vo's that don't like
gaps between lines), which could make things slower too.

IMHO, it's not obvious that the padding method will be faster than what
I've come up with.

> could you send seperate patches for each separate change

Ok, I'm sending two patches.  The first patch fixes the functions so they
both work.  The second uses a table lookup instead of a multiply to speed
up the calculation of x*y.  I benchmarked put_h264_chroma_mc4_mmx2() with
rdtsc, using 1200 frames (fits in disk cache) of the Elephant Dream's 1024
MPEG4 avi with lowres=2.  mplayer (with correct options) was run 50 times
in a row for each version, then that was repeated 3 times (total 150 runs
per version).

Estimated relative speed improvement against current svn:
fixed	       -5.68%
svn		0.00%
table	       16.04%
fixed+table    14.77%

The version with both patches is only 1.51% slower than if just the table
lookup is applied (which will not fix the bugs), and is still 14.77% faster
than what is in svn now.
-------------- next part --------------
Index: i386/dsputil_h264_template_mmx.c
===================================================================
--- i386/dsputil_h264_template_mmx.c	(revision 7884)
+++ i386/dsputil_h264_template_mmx.c	(working copy)
@@ -310,15 +310,13 @@
 
         asm volatile(
             /* dst[0,1] = pack((mm1 + 32) >> 6) */
-            "paddw %1, %%mm1\n\t"
+            "paddw %2, %%mm1\n\t"
             "psrlw $6, %%mm1\n\t"
             "packssdw %%mm7, %%mm1\n\t"
             "packuswb %%mm7, %%mm1\n\t"
-            /* writes garbage to the right of dst.
-             * ok because partitions are processed from left to right. */
-            H264_CHROMA_OP4(%0, %%mm1, %%mm3)
+            H264_CHROMA_OP2(%0, %1, %%mm1, %%mm3)
             "movd %%mm1, %0\n\t"
-            : "=m" (dst[0]) : "m" (ff_pw_32));
+            : "=m" (dst[0]), "=m" (dst[2]) : "m" (ff_pw_32));
         dst += stride;
     }
 }
Index: i386/h264dsp_mmx.c
===================================================================
--- i386/h264dsp_mmx.c	(revision 7884)
+++ i386/h264dsp_mmx.c	(working copy)
@@ -1374,6 +1374,7 @@
 
 #define H264_CHROMA_OP(S,D)
 #define H264_CHROMA_OP4(S,D,T)
+#define H264_CHROMA_OP2(S,S2,D,T) "punpcklwd " #S2 ", " #D "\n\t"
 #define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_mmx
 #define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_mmx
 #define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2
@@ -1381,6 +1382,7 @@
 #include "dsputil_h264_template_mmx.c"
 #undef H264_CHROMA_OP
 #undef H264_CHROMA_OP4
+#undef H264_CHROMA_OP2
 #undef H264_CHROMA_MC8_TMPL
 #undef H264_CHROMA_MC4_TMPL
 #undef H264_CHROMA_MC2_TMPL
@@ -1389,6 +1391,10 @@
 #define H264_CHROMA_OP(S,D) "pavgb " #S ", " #D " \n\t"
 #define H264_CHROMA_OP4(S,D,T) "movd  " #S ", " #T " \n\t"\
                                "pavgb " #T ", " #D " \n\t"
+#define H264_CHROMA_OP2(S,S2,D,T) "movd  " #S ", " #T "\n\t"\
+                                  "pavgb " #T ", " #D "\n\t"\
+                                  "psrld $16, " #T "\n\t"\
+                                  "punpcklwd " #T ", " #D "\n\t"
 #define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_mmx2
 #define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_mmx2
 #define H264_CHROMA_MC2_TMPL avg_h264_chroma_mc2_mmx2
@@ -1396,6 +1402,7 @@
 #include "dsputil_h264_template_mmx.c"
 #undef H264_CHROMA_OP
 #undef H264_CHROMA_OP4
+#undef H264_CHROMA_OP2
 #undef H264_CHROMA_MC8_TMPL
 #undef H264_CHROMA_MC4_TMPL
 #undef H264_CHROMA_MC2_TMPL
-------------- next part --------------
--- i386/h264dsp_mmx.c.old	2007-02-08 20:18:07.000000000 -0800
+++ i386/h264dsp_mmx.c	2007-02-08 20:18:47.000000000 -0800
@@ -1372,6 +1372,16 @@
 H264_MC(avg_, 16,mmx2)
 
 
+static const uint8_t xtimesy[8][8] = { 
+	{ 0, 0, 0, 0, 0, 0, 0, 0},
+	{ 0, 1, 2, 3, 4, 5, 6, 7},
+	{ 0, 2, 4, 6, 8,10,12,14},
+	{ 0, 3, 6, 9,12,15,18,21},
+	{ 0, 4, 8,12,16,20,24,28},
+	{ 0, 5,10,15,20,25,30,35},
+	{ 0, 6,12,18,24,30,36,42},
+	{ 0, 7,14,21,28,35,42,49} };
+
 #define H264_CHROMA_OP(S,D)
 #define H264_CHROMA_OP4(S,D,T)
 #define H264_CHROMA_OP2(S,S2,D,T) "punpcklwd " #S2 ", " #D "\n\t"
--- i386/dsputil_h264_template_mmx.c.old	2007-02-08 20:18:07.000000000 -0800
+++ i386/dsputil_h264_template_mmx.c	2007-02-08 20:18:40.000000000 -0800
@@ -265,7 +265,7 @@
 #ifdef H264_CHROMA_MC2_TMPL
 static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
 {
-    int CD=((1<<16)-1)*x*y + 8*y;
+    int CD=((1<<16)-1)*xtimesy[x][y] + 8*y;
     int AB=((8<<16)-8)*x + 64 - CD;
     int i;
 



More information about the ffmpeg-devel mailing list