[Ffmpeg-devel] [patch] PIC fixes for i386

Michael Niedermayer michaelni
Sun Dec 4 01:38:23 CET 2005


Hi

On Sun, Dec 04, 2005 at 12:36:16AM +0100, Luca Barbato wrote:
> The pax team sent me a nice patch that removes the textrels and 

This must be tested with gcc 2.95, 3.3, 3.4 and 4.0 at minimum. I am pretty
sure it won't work on all of them; some of these changes reverse gcc bug
workarounds, IIRC.


> surprisingly seems to speed up a bit everything overall.

if you did any benchmarks, please post them


[...]
>  static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
> +    long dummy;
>      asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
> -        "movd  %4, %%mm0		\n\t"
> -        "movd  %5, %%mm1		\n\t"
> -        "movd  %6, %%mm2		\n\t"
> -        "movd  %7, %%mm3		\n\t"
> +        "movd  (%3), %%mm0		\n\t"
> +        "movd  (%3, %4), %%mm1		\n\t"
> +        "movd  (%3, %4, 2), %%mm2	\n\t"
> +        "lea  (%4, %4, 2), %0		\n\t"

Something like "add %4, %3" after "movd  (%3), %%mm0" should also work, with
minor changes to the code below, and would be faster on some CPUs.


[...]
> +        "lea  (%2, %2, 2), %0		\n\t"

same as above


[...]
>  #ifdef HAVE_MMX
>  					asm volatile(
> -						"movq (%%"REG_a"), %%mm2	\n\t" // packedYOffset
> -						"movq 8(%%"REG_a"), %%mm3	\n\t" // packedYScale
> -						"lea (%2,%4), %%"REG_a"	\n\t"
> -						"lea (%3,%5), %%"REG_d"	\n\t"
> +						"movq (%0), %%mm2	\n\t" // packedYOffset
> +						"movq 8(%0), %%mm3	\n\t" // packedYScale
>  						"pxor %%mm4, %%mm4	\n\t"
>  #ifdef HAVE_MMX2
>  #define REAL_SCALED_CPY(src1, src2, dst1, dst2)					\
> @@ -3257,21 +3255,20 @@
>  #define SCALED_CPY(src1, src2, dst1, dst2)\
>     REAL_SCALED_CPY(src1, src2, dst1, dst2)
>  
> -SCALED_CPY((%2)       , (%2, %4)      , (%3)       , (%3, %5))
> -SCALED_CPY((%2, %4, 2), (%%REGa, %4, 2), (%3, %5, 2), (%%REGd, %5, 2))
> -SCALED_CPY((%2, %4, 4), (%%REGa, %4, 4), (%3, %5, 4), (%%REGd, %5, 4))
> -						"lea (%%"REG_a",%4,4), %%"REG_a"	\n\t"
> -						"lea (%%"REG_d",%5,4), %%"REG_d"	\n\t"
> -SCALED_CPY((%%REGa, %4), (%%REGa, %4, 2), (%%REGd, %5), (%%REGd, %5, 2))
> -
> -
> -						: "=&a" (packedOffsetAndScale)
> -						: "0" (packedOffsetAndScale),
> +SCALED_CPY((%1)       , (%1, %3)      , (%2)       , (%2, %4))
> +SCALED_CPY((%1, %3, 2), (%1, %3, 4)   , (%2, %4, 2), (%2, %4, 4))
> +						"lea (%1,%3,2), %1	\n\t"
> +						"lea (%2,%4,2), %2	\n\t"
> +SCALED_CPY((%1, %3, 2), (%1, %3, 4)   , (%2, %4, 2), (%2, %4, 4))
> +						"lea (%1,%3), %1	\n\t"
> +						"lea (%2,%4), %2	\n\t"
> +SCALED_CPY((%1, %3, 2), (%1, %3, 4)   , (%2, %4, 2), (%2, %4, 4))
> +						: : "r" (packedOffsetAndScale),
>  						"r"(src),
>  						"r"(dst),

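this code is wrong

[Note: as quoted above, the new code writes to %1 and %2 with lea although
they are declared as input-only "r" operands, and the second SCALED_CPY reads
(%1, %3, 4), i.e. base + 4*stride, where the old code read (%%REGa, %4, 2)
with REGa = base + stride, i.e. base + 3*stride -- so row 3 is skipped and
row 4 is processed twice.]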

[...]

>  					asm volatile(
> -						"lea (%0,%2), %%"REG_a"	\n\t"
> -						"lea (%1,%3), %%"REG_d"	\n\t"
> -
>  #define REAL_SIMPLE_CPY(src1, src2, dst1, dst2)				\
>  						"movq " #src1 ", %%mm0	\n\t"\
>  						"movq " #src2 ", %%mm1	\n\t"\
> @@ -3296,17 +3290,18 @@
>     REAL_SIMPLE_CPY(src1, src2, dst1, dst2)
>  
>  SIMPLE_CPY((%0)       , (%0, %2)      , (%1)       , (%1, %3))
> -SIMPLE_CPY((%0, %2, 2), (%%REGa, %2, 2), (%1, %3, 2), (%%REGd, %3, 2))
> -SIMPLE_CPY((%0, %2, 4), (%%REGa, %2, 4), (%1, %3, 4), (%%REGd, %3, 4))
> -						"lea (%%"REG_a",%2,4), %%"REG_a"	\n\t"
> -						"lea (%%"REG_d",%3,4), %%"REG_d"	\n\t"
> -SIMPLE_CPY((%%REGa, %2), (%%REGa, %2, 2), (%%REGd, %3), (%%REGd, %3, 2))
> -
> +SIMPLE_CPY((%0, %2, 2), (%0, %2, 4)   , (%1, %3, 2), (%1, %3, 4))
> +						"lea (%0,%2,2), %0	\n\t"
> +						"lea (%1,%3,2), %1	\n\t"
> +SIMPLE_CPY((%0, %2)   , (%0, %2, 4)   , (%1, %3)   , (%1, %3, 4))
> +						"lea (%0,%2), %0	\n\t"
> +						"lea (%1,%3), %1	\n\t"
> +SIMPLE_CPY((%0, %2, 2), (%0, %2, 4)   , (%1, %3, 2), (%1, %3, 4))
>  						: : "r" (src),
>  						"r" (dst),

This code is wrong too, not to mention that add is faster than lea and should have been used.

[...]

-- 
Michael





More information about the ffmpeg-devel mailing list