[Ffmpeg-devel] PATCH Blackfin optimized byte swapping mechanism

Marc Hoffman mmh
Mon Apr 23 19:12:47 CEST 2007


Michael Niedermayer writes:
 > Hi
 > 
 > On Tue, Apr 17, 2007 at 08:49:40AM -0400, Marc Hoffman wrote:
 > > Michael Niedermayer writes:
 > >  > Hi
 > >  > 
 > >  > On Tue, Apr 17, 2007 at 07:40:47AM -0400, Marc Hoffman wrote:
 > >  > Content-Description: message body text
 > >  > > 
 > >  > >  > Low level bswap primitive for the Blackfin Architecture.
 > >  > > 
 > >  > > Sorry, the last patch was mangled by the wrong encoding.
 > >  > 
 > >  > what advantage do these functions have over the default?
 > >  > are they faster? if so you should provide some benchmarks
 > > 
 > > Sorry about the top post; please forgive me.
 > > 
 > > The current 32bit byte swap routine produces this code sequence
 > > 
 > > So I guess this is about 300% improvement in performance for this function.
 > 
 > A guess is good, a hard benchmark is better; it's just 5 minutes of work to
 > write a loop of bswap and run "time myprog".
 > Also, don't forget to set the proper -mcpu / -march and -O3 with gcc.

Correction: ~200%. Is the patch acceptable now?

yoda:~/bs mmh$ bfin-linux-uclibc-gcc -O3 bswap.c -o bswap/bs/bswap
yoda:~/bs mmh$ rsh -l root mad /u/bs/bswap
fast is 12608161
slow is 24637378
improvement: 195.408180
yoda:~/bs mmh$ 


#include <stdio.h>

/* Stub for this stand-alone benchmark: av_always_inline expands to nothing here. */
#define av_always_inline 
/* Local uint32_t definition; the only header included above is <stdio.h>. */
typedef unsigned int uint32_t;

/*
 * Blackfin inline-assembly byte swap of a 32-bit value; the candidate being
 * benchmarked against slow_bswap_32().
 *
 * Operands:
 *   %0 = x   -- read/write data register ("+d"), holds input and result.
 *   %1 = tmp -- write-only scratch data register, marked earlyclobber
 *               ("=&d") because it is written while x is still needed.
 *
 * NOTE(review): the "(V)" suffix is presumably the Blackfin vector form,
 * shifting each 16-bit half independently (which is what makes the
 * mask-free shift/OR swap the bytes inside each half) -- confirm against
 * the Blackfin programming reference.
 *
 * Returns the value left in x by the asm sequence.
 */
static av_always_inline uint32_t fast_bswap_32(uint32_t x){
    unsigned tmp;
    asm("%1 = %0 >> 8 (V);\n\t"        /* tmp = x >> 8                        */
        "%0 = %0 << 8 (V);\n\t"        /* x   = x << 8                        */
        "%0 = %0 | %1;\n\t"            /* x   = x | tmp                       */
        "%0 = PACK(%0.L, %0.H);\n\t"   /* rebuild x from its low/high halves,
                                          presumably exchanging them -- confirm */
        : "+d"(x), "=&d"(tmp));
    return x;
}

/*
 * Plain-C 32-bit byte reversal used as the benchmark baseline.
 *
 * Step 1 exchanges the two bytes inside each 16-bit half; step 2 exchanges
 * the two 16-bit halves. Example: 0x11223344 -> 0x44332211.
 */
static av_always_inline uint32_t slow_bswap_32(uint32_t x){
    /* Bytes moved up by one position land in the odd byte lanes... */
    const uint32_t moved_up   = (x << 8) & 0xFF00FF00;
    /* ...and bytes moved down by one position land in the even lanes. */
    const uint32_t moved_down = (x >> 8) & 0x00FF00FF;
    const uint32_t pairs_swapped = moved_up | moved_down;

    /* Rotate by 16 bits to exchange the upper and lower halves. */
    return (pairs_swapped >> 16) | (pairs_swapped << 16);
}

/* Data set that the benchmark loops byte-swap in place. */
unsigned long buf[2048];

/*
 * Cycle-counter helpers, written as GCC statement expressions around
 * Blackfin inline asm that reads the "cycles" register into a data register.
 *
 * clock()      evaluates to the current counter value (as int).
 * clockdiff(x) evaluates to (current counter value - x) (as int).
 *
 * In clockdiff the output %0 is written by "%0=cycles" BEFORE the input %1
 * is consumed by the subtraction, so the output must be marked earlyclobber
 * ("=&d"). With a plain "=d" the compiler is allowed to place %0 and %1 in
 * the same register, which would overwrite the start time and always
 * produce 0.
 *
 * NOTE(review): the name clock() would collide with the <time.h> function of
 * the same name if that header were ever included; kept as-is for callers.
 */
#define clock()      ({ int _t; asm volatile ("%0=cycles;" : "=d" (_t)); _t; })
#define clockdiff(x) ({ int _t; asm volatile ("%0=cycles; %0=%0-%1;" : "=&d" (_t) : "d" (x)); _t; })

/*
 * Benchmark driver.
 *
 * Fills buf with a fixed pattern, then times 1000 passes over the 2048-entry
 * buffer with fast_bswap_32() and another 1000 passes with slow_bswap_32(),
 * using the clock()/clockdiff() cycle-counter macros.
 *
 * Prints both cycle counts and "improvement", which is computed as
 * 100 * slow / fast (i.e. the slow time expressed as a percentage of the
 * fast time, so 200 means the fast version took half as long).
 *
 * Returns 0.
 */
int main (void)
{
  int j;
  int i;
  long st,t0,t1;

  for (i=0; i < 2048;i++) {
    buf[i]=0x11223344;
  }

  /* Timed region 1: inline-asm byte swap. */
  st = clock ();
  for (j=0;j<1000;j++) {
    for (i=0;i<2048;i++) {
      buf[i]= fast_bswap_32(buf[i]);
    }
  }
  t0 = clockdiff(st);

  /* Timed region 2: portable C byte swap over the same buffer. */
  st = clock ();
  for (j=0;j<1000;j++) {
    for (i=0;i<2048;i++) {
      buf[i]= slow_bswap_32(buf[i]);
    }
  }
  t1 = clockdiff(st);

  /* t0/t1 are long, so they must be printed with %ld (was %d: mismatched). */
  printf ("fast is %ld\nslow is %ld\nimprovement: %f\n", t0,t1, 100.0*t1/t0);
  return 0;
}




More information about the ffmpeg-devel mailing list