[FFmpeg-devel] [PATCH] swscale alpha channel support

Michael Niedermayer michaelni
Thu Mar 5 22:25:44 CET 2009


On Thu, Mar 05, 2009 at 03:09:26PM +0100, C?dric Schieli wrote:
> 2009/3/2 Michael Niedermayer <michaelni at gmx.at>:
> > On Fri, Feb 27, 2009 at 11:30:25PM +0100, C?dric Schieli wrote:
> 
> [...]
> 
> >> @@ -972,6 +986,14 @@
> >> ? ? ? ? ? ? ? ? ? ? ?: "%"REG_a
> >> ? ? ? ? ? ? ? ? ?);
> >> ? ? ? ? ? ? ?}
> >> + ? ? ? ? ? ?if (CONFIG_SWSCALE_ALPHA && aDest){
> >> + ? ? ? ? ? ? ? ?__asm__ volatile(
> >> + ? ? ? ? ? ? ? ? ? ?YSCALEYUV2YV121_ACCURATE
> >> + ? ? ? ? ? ? ? ? ? ?:: "r" (alpSrc+dstW), "r" (aDest+dstW),
> >> + ? ? ? ? ? ? ? ? ? ?"g" (-dstW)
> >> + ? ? ? ? ? ? ? ? ? ?: "%"REG_a
> >> + ? ? ? ? ? ? ? ?);
> >> + ? ? ? ? ? ?}
> >> ? ? ? ? ?}else{
> >> ? ? ? ? ? ? ?while(p--){
> >> ? ? ? ? ? ? ? ? ?__asm__ volatile(
> >> @@ -981,6 +1003,14 @@
> >> ? ? ? ? ? ? ? ? ? ? ?: "%"REG_a
> >> ? ? ? ? ? ? ? ? ?);
> >> ? ? ? ? ? ? ?}
> >> + ? ? ? ? ? ?if (CONFIG_SWSCALE_ALPHA && aDest){
> >> + ? ? ? ? ? ? ? ?__asm__ volatile(
> >> + ? ? ? ? ? ? ? ? ? ?YSCALEYUV2YV121
> >> + ? ? ? ? ? ? ? ? ? ?:: "r" (alpSrc+dstW), "r" (aDest+dstW),
> >> + ? ? ? ? ? ? ? ? ? ?"g" (-dstW)
> >> + ? ? ? ? ? ? ? ? ? ?: "%"REG_a
> >> + ? ? ? ? ? ? ? ?);
> >> + ? ? ? ? ? ?}
> >> ? ? ? ? ?}
> >> ? ? ? ? ?return;
> >> ? ? ?}
> >
> > i would prefer if these wouldnt be duplicated in the generated code
> 
> patch updated
> 
> > [...]
> >> @@ -1095,11 +1147,28 @@
> >> ? ? ? ? ? ? ?switch(c->dstFormat)
> >> ? ? ? ? ? ? ?{
> >> ? ? ? ? ? ? ?case PIX_FMT_RGB32:
> >> + ? ? ? ? ? ? ? ?if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
> >> + ? ? ? ? ? ? ? ? ? ?YSCALEYUV2PACKEDX
> >> + ? ? ? ? ? ? ? ? ? ?YSCALEYUV2RGBX
> >> + ? ? ? ? ? ? ? ? ? ?"movq ? ? ? ? ? ? ? ? ? ? ?%%mm2, "U_TEMP"(%0) ?\n\t"
> >> + ? ? ? ? ? ? ? ? ? ?"movq ? ? ? ? ? ? ? ? ? ? ?%%mm4, "V_TEMP"(%0) ?\n\t"
> >> + ? ? ? ? ? ? ? ? ? ?"movq ? ? ? ? ? ? ? ? ? ? ?%%mm5, "Y_TEMP"(%0) ?\n\t"
> >> + ? ? ? ? ? ? ? ? ? ?YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET)
> >> + ? ? ? ? ? ? ? ? ? ?"psraw ? ? ? ? ? ? ? ? ? ? ? ?$3, %%mm1 ? ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ? ? ?"psraw ? ? ? ? ? ? ? ? ? ? ? ?$3, %%mm7 ? ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ? ? ?"packuswb ? ? ? ? ? ? ? ? ?%%mm7, %%mm1 ? ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ? ? ?"movq ? ? ? ? ? ? ? "U_TEMP"(%0), %%mm2 ? ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ? ? ?"movq ? ? ? ? ? ? ? "V_TEMP"(%0), %%mm4 ? ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ? ? ?"movq ? ? ? ? ? ? ? "Y_TEMP"(%0), %%mm5 ? ? ? ? \n\t"
> >
> > it seems that YSCALEYUV2PACKEDX_YA could be changed to take teh registers as
> > parameters to avoid the movq TEMP stuff ?
> > also same applies to all other cases where its possible
> 
> patch updated
> 
> > [...]
> >> @@ -1191,6 +1260,32 @@
> >> ? ? ? ? ?{
> >> ? ? ? ? ? ? ?//Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
> >> ? ? ? ? ? ? ?case PIX_FMT_RGB32:
> >> + ? ? ? ? ? ? ? ?if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
> >> + ? ? ? ? ? ? ? ? ? ?*(uint16_t **)(&c->u_temp)=abuf0;
> >> + ? ? ? ? ? ? ? ? ? ?*(uint16_t **)(&c->v_temp)=abuf1;
> >> + ? ? ? ? ? ? ? ? ? ?__asm__ volatile(
> >> + ? ? ? ? ? ? ? ? ? ?"mov %%"REG_b", "ESP_OFFSET"(%5) ? ? ? ?\n\t"
> >> + ? ? ? ? ? ? ? ? ? ?"mov ? ? ? ?%4, %%"REG_b" ? ? ? ? ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ? ? ?"push %%"REG_BP" ? ? ? ? ? ? ? ? ? ? ? ?\n\t"
> >> + ? ? ? ? ? ? ? ? ? ?YSCALEYUV2RGB(%%REGBP, %5)
> >> + ? ? ? ? ? ? ? ? ? ?"push ? ? ? ? ? ? ? ? ? %0 ? ? ? ? ? ? ?\n\t"
> >> + ? ? ? ? ? ? ? ? ? ?"push ? ? ? ? ? ? ? ? ? %1 ? ? ? ? ? ? ?\n\t"
> >> + ? ? ? ? ? ? ? ? ? ?"mov ? ? ? ? ?"U_TEMP"(%5), %0 ? ? ? ? ?\n\t"
> >> + ? ? ? ? ? ? ? ? ? ?"mov ? ? ? ? ?"V_TEMP"(%5), %1 ? ? ? ? ?\n\t"
> >> + ? ? ? ? ? ? ? ? ? ?YSCALEYUV2RGB_YA(%%REGBP, %5)
> >> + ? ? ? ? ? ? ? ? ? ?"psraw ? ? ? ? ? ? ? ? ?$3, %%mm1 ? ? ? \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
> >> + ? ? ? ? ? ? ? ? ? ?"psraw ? ? ? ? ? ? ? ? ?$3, %%mm7 ? ? ? \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
> >> + ? ? ? ? ? ? ? ? ? ?"packuswb ? ? ? ? ? ?%%mm7, %%mm1 ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ? ? ?"pop ? ? ? ? ? ? ? ? ? ?%1 ? ? ? ? ? ? ?\n\t"
> >> + ? ? ? ? ? ? ? ? ? ?"pop ? ? ? ? ? ? ? ? ? ?%0 ? ? ? ? ? ? ?\n\t"
> >> + ? ? ? ? ? ? ? ? ? ?WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
> >> + ? ? ? ? ? ? ? ? ? ?"pop %%"REG_BP" ? ? ? ? ? ? ? ? ? ? ? ? \n\t"
> >> + ? ? ? ? ? ? ? ? ? ?"mov "ESP_OFFSET"(%5), %%"REG_b" ? ? ? ?\n\t"
> >> +
> >> + ? ? ? ? ? ? ? ? ? ?:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
> >> + ? ? ? ? ? ? ? ? ? ?"a" (&c->redDither)
> >
> > the push/pop in the inner loop can at least on x86_64 be avoided
> 
> patch updated
> 
> > [...]
> >
> >> +static inline void RENAME(bgr32ToA)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused){
> >> + ? ?int i;
> >> + ? ?for (i=0; i<width; i++){
> >> +#ifdef WORDS_BIGENDIAN
> >> + ? ? ? ?dst[i]= ((uint32_t *)src)[i]&0xFF;
> >> +#else
> >> + ? ? ? ?dst[i]= ((uint32_t *)src)[i]>>24;
> >> +#endif
> >> + ? ?}
> >> +}
> >> +
> >> +static inline void RENAME(bgr32_1ToA)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused){
> >> + ? ?int i;
> >> + ? ?for (i=0; i<width; i++){
> >> +#ifdef WORDS_BIGENDIAN
> >> + ? ? ? ?dst[i]= ((uint32_t *)src)[i]>>24;
> >> +#else
> >> + ? ? ? ?dst[i]= ((uint32_t *)src)[i]&0xFF;
> >> +#endif
> >> + ? ?}
> >> +}
> >> +
> >
> > for(i=0; i<w; i++)
> > ? ?dst[i]= src[4*i];
> > and adjust src when calling
> > 1 function less
> 
> patch updated
> 
> 
> Here is the updated patch set :
> 

> #1 : sws_yuva420p_isPlanarYUV.patch
> YUVA420P is a planar YUV format, isn't it ?

yes, patch ok


> 
> #2 : sws_scale_use_4_planes.patch
> don't ignore 4th plane

ok


[...]
> #5 : sws_configure_alpha.patch
> add --swscale-alpha configure option

not maintained by me


> 
> #6 : sws_yuva2rgb.patch
> updated to have a mmx version even without HAVE_7REGS

I would prefer to have simple code over working around gcc bugs
with #ifdefs


> 
> #7 : sws_scale_alpha.patch
> updated

comments below


> 
> #8 : sws_output_yuva420p.patch
> updated

comments below


> 
> #9 : swscale-example_use_alpha.patch
> updated
> the second hunk in this one needs some explanations :
> without it, swscale-example will segfault
> even without any of my patches, simply adding a uint16_t** field to
> struct SwsContext and av_malloc'ing between 13 and 236 bytes (or 26
> and 408 on x86_64) will make swscale-example to segfault (see
> bug.diff)
> after some debugging, I found that my alpPixBuf data get corrupted somehow
> disabling MMX code with --disable-mmx prevent this from happening
> running swscale-example under valgrind (even without bug.diff) will segfault
> can someone can help on this ?

in swscale_internal.h there are #defines that define the offsets of
various fields in the struct so the asm can access them, you dont maybe
mess that up by adding a field?


and if valgrind itself segfaults, that should be reported to the valgrind
devs ...


#7
[...]
> @@ -1191,6 +1254,52 @@
>          {
>              //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
>              case PIX_FMT_RGB32:
> +                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
> +#if !ARCH_X86_64
> +                    *(uint16_t **)(&c->u_temp)=abuf0;
> +                    *(uint16_t **)(&c->v_temp)=abuf1;
> +#endif
> +                    __asm__ volatile(
> +#if !ARCH_X86_64
> +                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
> +#endif
> +                    "mov        %4, %%"REG_b"               \n\t"
> +#if !ARCH_X86_64
> +                    "push %%"REG_BP"                        \n\t"
> +#endif
> +                    YSCALEYUV2RGB(%%REGBP, %5)
> +#if !ARCH_X86_64
> +                    "push                   %0              \n\t"
> +                    "push                   %1              \n\t"
> +                    "mov          "U_TEMP"(%5), %0          \n\t"
> +                    "mov          "V_TEMP"(%5), %1          \n\t"
> +#endif
> +#if ARCH_X86_64
> +                    YSCALEYUV2RGB_YA(%%REGBP, %5, %6, %7)
> +#else
> +                    YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
> +#endif
> +                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
> +                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
> +                    "packuswb            %%mm7, %%mm1       \n\t"
> +#if !ARCH_X86_64
> +                    "pop                    %1              \n\t"
> +                    "pop                    %0              \n\t"
> +#endif
> +                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
> +#if !ARCH_X86_64
> +                    "pop %%"REG_BP"                         \n\t"
> +                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
> +#endif
> +
> +                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
> +                    "a" (&c->redDither)
> +#if ARCH_X86_64
> +                    ,"r" (abuf0), "r" (abuf1)
> +                    : "%"REG_b, "%"REG_BP
> +#endif
> +                    );
> +                }else{
>                  __asm__ volatile(
>                  "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
>                  "mov        %4, %%"REG_b"               \n\t"

ehm, this looks really messy ...

[...]

> @@ -2144,11 +2293,17 @@
>      }
>      else if (srcFormat==PIX_FMT_RGB32)
>      {
> +        if (isAlpha)
> +            RENAME(bgr32ToA)(formatConvBuffer, src+3, srcW, pal);
> +        else
>          RENAME(bgr32ToY)(formatConvBuffer, src, srcW, pal);
>          src= formatConvBuffer;
>      }
>      else if (srcFormat==PIX_FMT_RGB32_1)
>      {
> +        if (isAlpha)
> +            RENAME(bgr32ToA)(formatConvBuffer, src, srcW, pal);
> +        else
>          RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
>          src= formatConvBuffer;
>      }
> @@ -2169,11 +2324,17 @@
>      }
>      else if (srcFormat==PIX_FMT_BGR32)
>      {
> +        if (isAlpha)
> +            RENAME(bgr32ToA)(formatConvBuffer, src+3, srcW, pal);
> +        else
>          RENAME(rgb32ToY)(formatConvBuffer, src, srcW, pal);
>          src= formatConvBuffer;
>      }
>      else if (srcFormat==PIX_FMT_BGR32_1)
>      {
> +        if (isAlpha)
> +            RENAME(bgr32ToA)(formatConvBuffer, src, srcW, pal);
> +        else
>          RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
>          src= formatConvBuffer;
>      }

This doesnt look like it would work on big endian systems


[...]

>  
> +    if (CONFIG_SWSCALE_ALPHA && (dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
> +        memset(dst[3], 255, dstStride[3]*(dstY-lastDstY));
> +

you cant write outside 0..width per line, also stride can be negative


[...]

#8
> Index: ffmpeg/libswscale/swscale.c
> ===================================================================
> --- ffmpeg.orig/libswscale/swscale.c	2009-03-03 09:31:07.000000000 +0100
> +++ ffmpeg/libswscale/swscale.c	2009-03-03 10:00:28.000000000 +0100
> @@ -133,6 +133,7 @@
>      )
>  #define isSupportedOut(x)   (       \
>             (x)==PIX_FMT_YUV420P     \
> +        || (x)==PIX_FMT_YUVA420P    \
>          || (x)==PIX_FMT_YUYV422     \
>          || (x)==PIX_FMT_UYVY422     \
>          || (x)==PIX_FMT_YUV444P     \
> @@ -2053,12 +2054,16 @@
>                        int srcSliceH, uint8_t* dst[], int dstStride[])
>  {
>      int plane;
> -    for (plane=0; plane<3; plane++)
> +    for (plane=0; plane<4; plane++)
>      {
> -        int length= plane==0 ? c->srcW  : -((-c->srcW  )>>c->chrDstHSubSample);
> -        int y=      plane==0 ? srcSliceY: -((-srcSliceY)>>c->chrDstVSubSample);
> -        int height= plane==0 ? srcSliceH: -((-srcSliceH)>>c->chrDstVSubSample);
> -
> +        int length= (plane==0 || plane==3) ? c->srcW  : -((-c->srcW  )>>c->chrDstHSubSample);
> +        int y=      (plane==0 || plane==3) ? srcSliceY: -((-srcSliceY)>>c->chrDstVSubSample);
> +        int height= (plane==0 || plane==3) ? srcSliceH: -((-srcSliceH)>>c->chrDstVSubSample);
> +

> +        if (((!isALPHA(c->srcFormat)) || (!isALPHA(c->dstFormat))) && plane == 3){

!(isALPHA(c->srcFormat) && isALPHA(c->dstFormat)) && plane == 3


> +            if (isALPHA(c->dstFormat))
> +                memset(dst[3], 255, dstStride[3]*height);

as said elewhere writing must be limited to 0..width

[...]



-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Dictatorship naturally arises out of democracy, and the most aggravated
form of tyranny and slavery out of the most extreme liberty. -- Plato
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20090305/808a0fed/attachment.pgp>



More information about the ffmpeg-devel mailing list