[FFmpeg-devel] [RFC] snow SSE2 optimizations (was: Re: [FFmpeg-cvslog] r10223 - in trunk/libavcodec/i386: dsputil_mmx.c snowdsp_mmx.c)

Tue Aug 28 00:07:02 CEST 2007

Hello,
On Mon, Aug 27, 2007 at 11:34:44PM +0200, Michael Niedermayer wrote:
> > > also there's some shift by 4 missing here
> > 
> > I don't think so, there is a "psraw $4, %%xmm0               \n\t"
> > further down. And I know the code is an unreadable mess. I'll try to
> > reimplement it sometime if no one else does...
> 
> the data after obmc is 16bit unsigned, the data after the IDWT is 13bit
> signed; the white point differs by a factor of 16, so a shift by 4 is needed
> to get them on the same level before adding ...

Right, right, I just missed a few lines of code while reading the C
version, thus the confusion.
Since the diff is unreadable, do you think the following is better than
the current code (I mean visually, it does decode correctly after all ;-),
though it is not measurably faster than the mmx code on my PC):

/*
 * Inline-asm fragment: load two 8-pixel lines of one prediction block.
 *
 * block      - index into the pointer array addressed by REG_a; the pointer
 *              stored at PTR_SIZE*block(REG_a) is fetched into REG_d.
 * dst1, dst2 - destination register operands (as asm strings). dst1 receives
 *              the 8 bytes at (REG_d), dst2 the 8 bytes at (REG_d + REG_c).
 *
 * Both results are then interleaved byte-wise with xmm7 (punpcklbw). The
 * function using this macro clears xmm7 beforehand, so this widens each byte
 * to a 16-bit word. REG_d is overwritten.
 * NOTE(review): REG_c is used as the line stride here - confirm at call site.
 */
#define load_block_twolines(block, dst1, dst2) \
             "mov "PTR_SIZE"*"#block"(%%"REG_a"), %%"REG_d" \n\t"\
             "movq (%%"REG_d"           ), "dst1"           \n\t"\
             "movq (%%"REG_d", %%"REG_c"), "dst2"           \n\t"\
             "punpcklbw          %%xmm7, "dst1"             \n\t"\
             "punpcklbw          %%xmm7, "dst2"             \n\t"

/*
 * Inline-asm fragment: load two 8-byte rows of weights relative to REG_S.
 *
 * offset     - constant byte offset of the first row from REG_S.
 * stride     - constant byte distance between the first and second row.
 * dst1, dst2 - destination register operands (as asm strings). dst1 receives
 *              the 8 bytes at offset(REG_S), dst2 the 8 bytes at
 *              stride+offset(REG_S).
 *
 * Both results are then interleaved byte-wise with xmm7 (punpcklbw). The
 * function using this macro clears xmm7 beforehand, so this widens each byte
 * to a 16-bit word.
 */
#define load_obmc_twolines(offset, stride, dst1, dst2) \
             "movq           "#offset"(%%"REG_S"), "dst1"   \n\t"\
             "movq "#stride"+"#offset"(%%"REG_S"), "dst2"   \n\t"\
             "punpcklbw          %%xmm7, "dst1"             \n\t"\
             "punpcklbw          %%xmm7, "dst2"             \n\t"

/*
 * Inline-asm fragment: add `delta` (an asm operand string) to each of the
 * four slots at REG_a + 0, 1, 2 and 3 times PTR_SIZE.
 *
 * The additions are done directly in memory, so the array addressed by REG_a
 * is modified in place.
 * NOTE(review): PTR_SIZE is presumably the pointer size in bytes as a string
 * literal; it is defined outside this snippet - confirm.
 */
#define inc_block(delta) \
             "add "delta", "PTR_SIZE"*3(%%"REG_a")  \n\t"\
             "add "delta", "PTR_SIZE"*2(%%"REG_a")  \n\t"\
             "add "delta", "PTR_SIZE"*1(%%"REG_a")  \n\t"\
             "add "delta", (%%"REG_a")              \n\t"

/**
 * SSE2 inner loop: blend four 8-pixel-wide prediction blocks with OBMC
 * weights, add the result to the IDWT output rows and store 8-bit pixels.
 *
 * Two rows are processed per loop iteration. For every row the code computes,
 * per pixel,
 *     sum  = block[3]*w(0) + block[2]*w(8) + block[1]*w(128) + block[0]*w(136)
 *     out  = clip_uint8((line[x] + (sum >> 4) + 8) >> 4)
 * where w(k) is the weight byte at obmc + k and `line` is the row taken from
 * sb->line[src_y + y], offset by src_x elements (the asm treats the row as
 * packed 16-bit words, byte offset src_x<<1).
 *
 * @param obmc         weight table; the row pitch is hard-coded to 16 bytes
 * @param obmc_stride  not referenced (16 is assumed by the asm)
 * @param block        array of four source-line pointers; NOTE: the four
 *                     entries are advanced in place by b_h * src_stride
 * @param b_w          not referenced (a width of 8 is assumed by the asm)
 * @param b_h          number of rows; must be even and non-zero, because the
 *                     loop subtracts 2 per iteration and only stops at 0
 * @param src_x        horizontal element offset into the sb rows
 * @param src_y        first row index into sb->line
 * @param src_stride   byte stride of both the block lines and dst8
 * @param sb           slice buffer providing the row pointers (sb->line)
 * @param add          not referenced
 * @param dst8         destination for the 8-bit output pixels, 8 bytes per row
 *
 * NOTE(review): the row loads use movdqu, so no 16-byte alignment of the sb
 * rows is required by this code.
 */
static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t
*obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
                      int src_x, int src_y, long src_stride,
slice_buffer * sb, int add, uint8_t * dst8){
    IDWTELEM **dst_array = sb->line + src_y; /* first of the row pointers used */
    long tmp;                                /* remaining-row counter (scratch register) */
    asm volatile (
        /* Setup: REG_c = src_stride, tmp = b_h, REG_S = obmc, xmm7 = 0. */
        "mov  %7, %%"REG_c"                     \n\t"
        "mov  %6, %2                            \n\t"
        "mov  %4, %%"REG_S"                     \n\t"
        "pxor %%xmm7, %%xmm7                    \n\t"
        /* xmm3 = 8 in every 16-bit word: all ones -> 0x8000 -> 0x0008.
         * Used as the rounding term for the final >> 4. */
        "pcmpeqd %%xmm3, %%xmm3                 \n\t"
        "psllw $15, %%xmm3                      \n\t"
        "psrlw $12, %%xmm3                      \n\t"
        "1:                                     \n\t"
        /* REG_D = dst_array[0] + (src_x << 1): first row of this pair. */
        "mov %1, %%"REG_D"                      \n\t"
        "mov (%%"REG_D"), %%"REG_D"             \n\t"
        "add %3, %%"REG_D"                      \n\t"

        /* xmm1 / xmm5 accumulate the weighted sums for row 0 / row 1. */
        load_block_twolines(3,     "%%xmm1", "%%xmm5")
        load_obmc_twolines (0, 16, "%%xmm0", "%%xmm4")
        "pmullw %%xmm0, %%xmm1                  \n\t"
        "pmullw %%xmm4, %%xmm5                  \n\t"

        load_block_twolines(2,     "%%xmm2", "%%xmm6")
        load_obmc_twolines (8, 16, "%%xmm0", "%%xmm4")
        "pmullw  %%xmm0, %%xmm2                  \n\t"
        "pmullw  %%xmm4, %%xmm6                  \n\t"
        "paddusw %%xmm2, %%xmm1                  \n\t"
        "paddusw %%xmm6, %%xmm5                  \n\t"

        load_block_twolines(1,        "%%xmm2", "%%xmm6")
        load_obmc_twolines (128,  16, "%%xmm0", "%%xmm4")
        "pmullw  %%xmm0, %%xmm2                  \n\t"
        "pmullw  %%xmm4, %%xmm6                  \n\t"
        "paddusw %%xmm2, %%xmm1                  \n\t"
        "paddusw %%xmm6, %%xmm5                  \n\t"

        load_block_twolines(0,        "%%xmm2", "%%xmm6")
        load_obmc_twolines (136,  16, "%%xmm0", "%%xmm4")
        "pmullw  %%xmm0, %%xmm2                  \n\t"
        "pmullw  %%xmm4, %%xmm6                  \n\t"
        "paddusw %%xmm2, %%xmm1                  \n\t"
        "paddusw %%xmm6, %%xmm5                  \n\t"

        /* REG_d = current dst8 output pointer. */
        "mov %0, %%"REG_d"                       \n\t"

        /* Row 0: scale the unsigned sum down by 4 bits so it is on the same
         * level as the IDWT data, add the rounding term and the IDWT row. */
        "movdqu (%%"REG_D"), %%xmm0              \n\t"
        "psrlw $4, %%xmm1                        \n\t"
        "paddw %%xmm3, %%xmm1                    \n\t"
        "paddw %%xmm1, %%xmm0                    \n\t"

        /* REG_D = dst_array[1] + (src_x << 1): second row of this pair. */
        "mov %1, %%"REG_D"                       \n\t"
        "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D"    \n\t"
        "add %3, %%"REG_D"                       \n\t"

        /* Row 1: same computation as row 0. */
        "movdqu (%%"REG_D"), %%xmm4              \n\t"
        "psrlw $4, %%xmm5                        \n\t"
        "paddw %%xmm3, %%xmm5                    \n\t"
        "paddw %%xmm5, %%xmm4                    \n\t"

        /* Final arithmetic >> 4, saturate both rows to unsigned bytes and
         * store: low 8 bytes -> dst8 row 0, high 8 bytes -> dst8 row 1. */
        "psraw $4, %%xmm0                        \n\t"
        "psraw $4, %%xmm4                        \n\t"
        "packuswb %%xmm4, %%xmm0                 \n\t"
        "movq   %%xmm0, (%%"REG_d")              \n\t"
        "movhpd %%xmm0, (%%"REG_d",%%"REG_c")    \n\t"

        /* Advance everything by two rows. REG_c is temporarily doubled to
         * 2*src_stride and restored afterwards. */
        "sal $1, %%"REG_c"                       \n\t"
        "add $"PTR_SIZE"*2, %1                   \n\t"
        "add $16*2, %%"REG_S"                    \n\t"
        "add %%"REG_c", %0                       \n\t"
        inc_block("%%"REG_c)
        "sar $1, %%"REG_c"                       \n\t"
        "sub $2, %2                              \n\t"
        "jnz 1b                                  \n\t"
    : "+m"(dst8), "+m"(dst_array), "=&r"(tmp)
    : "rm"((long)(src_x<<1)), "m"(obmc), "a"(block), "m"((long)b_h), "m"((long)src_stride)
    /* "memory": the asm reads the obmc table, the block lines and the sb rows,
     * writes pixels through dst8 and updates block[0..3] in place. None of
     * that memory is an operand, so the compiler must be told not to cache or
     * reorder accesses across this statement. */
    : "memory", "%"REG_c,"%"REG_S,"%"REG_D,"%"REG_d);
}

Greetings,
Reimar Döffinger