[FFmpeg-devel] [PATCH] snow SSE2 add_yblock

Reimar Döffinger Reimar.Doeffinger
Thu Aug 30 16:56:41 CEST 2007

The attached patch should contain a working version.
I have replaced several of the hardcoded registers with something more
flexible, because I also found it nicer to read.
Suggestions welcome (though optimizations IMO should be done after
applying and reenabling).
Also, it is better not to read the patch itself; apply it and read the
resulting asm instead, because diff made something quite butchered out of this.

Reimar Döffinger
-------------- next part --------------
Index: libavcodec/i386/snowdsp_mmx.c
--- libavcodec/i386/snowdsp_mmx.c	(revision 10261)
+++ libavcodec/i386/snowdsp_mmx.c	(working copy)
@@ -603,157 +603,173 @@
 #endif //HAVE_7REGS
-#define snow_inner_add_yblock_sse2_header \
-    IDWTELEM * * dst_array = sb->line + src_y;\
-    long tmp;\
-    asm volatile(\
-             "mov  %7, %%"REG_c"             \n\t"\
-             "mov  %6, %2                    \n\t"\
-             "mov  %4, %%"REG_S"             \n\t"\
-             "pxor %%xmm7, %%xmm7            \n\t" /* 0 */\
-             "pcmpeqd %%xmm3, %%xmm3         \n\t"\
-             "psllw $15, %%xmm3              \n\t"\
-             "psrlw $12, %%xmm3              \n\t" /* FRAC_BITS >> 1 */\
-             "1:                             \n\t"\
-             "mov %1, %%"REG_D"              \n\t"\
-             "mov (%%"REG_D"), %%"REG_D"     \n\t"\
-             "add %3, %%"REG_D"              \n\t"
+#define load_block_line(block, dst1, dst2) \
+             "mov "PTR_SIZE"*"#block"(%%"REG_a"), %%"REG_d" \n\t"\
+             "movq  (%%"REG_d"), "dst1"                     \n\t"\
+             "movq 8(%%"REG_d"), "dst2"                     \n\t"\
+             "punpcklbw          %%xmm7, "dst1"             \n\t"\
+             "punpcklbw          %%xmm7, "dst2"             \n\t"
-#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
-             "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
-             "movq (%%"REG_d"), %%"out_reg1" \n\t"\
-             "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\
-             "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
-             "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
-             "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
-             "movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\
-             "punpcklbw %%xmm7, %%xmm0       \n\t"\
-             "punpcklbw %%xmm7, %%xmm4       \n\t"\
-             "pmullw %%xmm0, %%"out_reg1"    \n\t"\
-             "pmullw %%xmm4, %%"out_reg2"    \n\t"
+#define load_block_twolines(block, stride, dst1, dst2) \
+             "mov "PTR_SIZE"*"#block"(%%"REG_a"), %%"REG_d" \n\t"\
+             "movq (%%"REG_d"          ), "dst1"            \n\t"\
+             "movq (%%"REG_d", "stride"), "dst2"            \n\t"\
+             "punpcklbw          %%xmm7, "dst1"             \n\t"\
+             "punpcklbw          %%xmm7, "dst2"             \n\t"
-#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
-             "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
-             "movq (%%"REG_d"), %%"out_reg1" \n\t"\
-             "movq 8(%%"REG_d"), %%"out_reg2" \n\t"\
-             "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
-             "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
-             "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
-             "movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\
-             "punpcklbw %%xmm7, %%xmm0       \n\t"\
-             "punpcklbw %%xmm7, %%xmm4       \n\t"\
-             "pmullw %%xmm0, %%"out_reg1"    \n\t"\
-             "pmullw %%xmm4, %%"out_reg2"    \n\t"
+#define load_obmc_twolines(obmc, offset, stride, dst1, dst2) \
+             "movq           "#offset"("obmc"), "dst1"      \n\t"\
+             "movq "#stride"+"#offset"("obmc"), "dst2"      \n\t"\
+             "punpcklbw          %%xmm7, "dst1"             \n\t"\
+             "punpcklbw          %%xmm7, "dst2"             \n\t"
-#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
-             snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
-             "paddusw %%xmm2, %%xmm1         \n\t"\
-             "paddusw %%xmm6, %%xmm5         \n\t"
+#define inc_block(delta) \
+             "add "delta", "PTR_SIZE"*3(%%"REG_a")  \n\t"\
+             "add "delta", "PTR_SIZE"*2(%%"REG_a")  \n\t"\
+             "add "delta", "PTR_SIZE"*1(%%"REG_a")  \n\t"\
+             "add "delta", (%%"REG_a")              \n\t"
-#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
-             snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
-             "paddusw %%xmm2, %%xmm1         \n\t"\
-             "paddusw %%xmm6, %%xmm5         \n\t"
+static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
+                      int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+    IDWTELEM **dst_array = sb->line + src_y;
+    asm volatile (
+        "pxor %%xmm7, %%xmm7                    \n\t"
+        "pcmpeqd %%xmm3, %%xmm3                 \n\t"
+        "psllw $15, %%xmm3                      \n\t"
+        "psrlw $12, %%xmm3                      \n\t"
+        "1:                                     \n\t"
+        "mov %1, %%"REG_D"                      \n\t"
+        "mov (%%"REG_D"), %%"REG_D"             \n\t"
+        "add %5, %%"REG_D"                      \n\t"
-#define snow_inner_add_yblock_sse2_end_common1\
-             "add $32, %%"REG_S"             \n\t"\
-             "add %%"REG_c", %0              \n\t"\
-             "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
-             "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
-             "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
-             "add %%"REG_c", (%%"REG_a")     \n\t"
+        load_block_twolines(3,       "%4", "%%xmm1", "%%xmm5")
+        load_obmc_twolines ("%2",   0, 16, "%%xmm0", "%%xmm4")
+        "pmullw %%xmm0, %%xmm1                  \n\t"
+        "pmullw %%xmm4, %%xmm5                  \n\t"
-#define snow_inner_add_yblock_sse2_end_common2\
-             "jnz 1b                         \n\t"\
-             :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
-             :\
-             "rm"((long)(src_x<<1)),"m"(obmc),"a"(block),"m"((long)b_h),"m"((long)src_stride):\
-             "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
+        load_block_twolines(2,       "%4", "%%xmm2", "%%xmm6")
+        load_obmc_twolines ("%2",   8, 16, "%%xmm0", "%%xmm4")
+        "pmullw  %%xmm0, %%xmm2                  \n\t"
+        "pmullw  %%xmm4, %%xmm6                  \n\t"
+        "paddw   %%xmm2, %%xmm1                  \n\t"
+        "paddw   %%xmm6, %%xmm5                  \n\t"
-#define snow_inner_add_yblock_sse2_end_8\
-             "sal $1, %%"REG_c"              \n\t"\
-             "add $"PTR_SIZE"*2, %1          \n\t"\
-             snow_inner_add_yblock_sse2_end_common1\
-             "sar $1, %%"REG_c"              \n\t"\
-             "sub $2, %2                     \n\t"\
-             snow_inner_add_yblock_sse2_end_common2
+        load_block_twolines(1,       "%4", "%%xmm2", "%%xmm6")
+        load_obmc_twolines ("%2", 128, 16, "%%xmm0", "%%xmm4")
+        "pmullw  %%xmm0, %%xmm2                  \n\t"
+        "pmullw  %%xmm4, %%xmm6                  \n\t"
+        "paddw   %%xmm2, %%xmm1                  \n\t"
+        "paddw   %%xmm6, %%xmm5                  \n\t"
-#define snow_inner_add_yblock_sse2_end_16\
-             "add $"PTR_SIZE"*1, %1          \n\t"\
-             snow_inner_add_yblock_sse2_end_common1\
-             "dec %2                         \n\t"\
-             snow_inner_add_yblock_sse2_end_common2
+        load_block_twolines(0,       "%4", "%%xmm2", "%%xmm6")
+        load_obmc_twolines ("%2", 136, 16, "%%xmm0", "%%xmm4")
+        "pmullw  %%xmm0, %%xmm2                  \n\t"
+        "pmullw  %%xmm4, %%xmm6                  \n\t"
+        "paddw   %%xmm2, %%xmm1                  \n\t"
+        "paddw   %%xmm6, %%xmm5                  \n\t"
-static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
-                      int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
-snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
-snow_inner_add_yblock_sse2_accum_8("2", "8")
-snow_inner_add_yblock_sse2_accum_8("1", "128")
-snow_inner_add_yblock_sse2_accum_8("0", "136")
+        "mov %0, %%"REG_d"                       \n\t"
-             "mov %0, %%"REG_d"              \n\t"
-             "movdqa (%%"REG_D"), %%xmm0     \n\t"
-             "movdqa %%xmm1, %%xmm2          \n\t"
+        "movdqu (%%"REG_D"), %%xmm2              \n\t"
-             "punpckhwd %%xmm7, %%xmm1       \n\t"
-             "punpcklwd %%xmm7, %%xmm2       \n\t"
-             "paddd %%xmm2, %%xmm0           \n\t"
-             "movdqa 16(%%"REG_D"), %%xmm2   \n\t"
-             "paddd %%xmm1, %%xmm2           \n\t"
-             "paddd %%xmm3, %%xmm0           \n\t"
-             "paddd %%xmm3, %%xmm2           \n\t"
+        "psrlw $4, %%xmm1                        \n\t"
+        "psrlw $4, %%xmm5                        \n\t"
-             "mov %1, %%"REG_D"              \n\t"
-             "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t"
-             "add %3, %%"REG_D"              \n\t"
+        "mov %1, %%"REG_D"                       \n\t"
+        "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D"    \n\t"
+        "add %5, %%"REG_D"                       \n\t"
-             "movdqa (%%"REG_D"), %%xmm4     \n\t"
-             "movdqa %%xmm5, %%xmm6          \n\t"
-             "punpckhwd %%xmm7, %%xmm5       \n\t"
-             "punpcklwd %%xmm7, %%xmm6       \n\t"
-             "paddd %%xmm6, %%xmm4           \n\t"
-             "movdqa 16(%%"REG_D"), %%xmm6   \n\t"
-             "paddd %%xmm5, %%xmm6           \n\t"
-             "paddd %%xmm3, %%xmm4           \n\t"
-             "paddd %%xmm3, %%xmm6           \n\t"
+        "movdqu (%%"REG_D"), %%xmm6              \n\t"
-             "psrad $8, %%xmm0               \n\t" /* FRAC_BITS. */
-             "psrad $8, %%xmm2               \n\t" /* FRAC_BITS. */
-             "packssdw %%xmm2, %%xmm0        \n\t"
-             "packuswb %%xmm7, %%xmm0        \n\t"
-             "movq %%xmm0, (%%"REG_d")       \n\t"
+        "paddw   %%xmm2, %%xmm1                  \n\t"
+        "paddw   %%xmm6, %%xmm5                  \n\t"
-             "psrad $8, %%xmm4               \n\t" /* FRAC_BITS. */
-             "psrad $8, %%xmm6               \n\t" /* FRAC_BITS. */
-             "packssdw %%xmm6, %%xmm4        \n\t"
-             "packuswb %%xmm7, %%xmm4        \n\t"
-             "movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t"
+        "paddw   %%xmm3, %%xmm1                  \n\t"
+        "paddw   %%xmm3, %%xmm5                  \n\t"
+        "psraw $4, %%xmm1                        \n\t"
+        "psraw $4, %%xmm5                        \n\t"
+        "packuswb %%xmm5, %%xmm1                 \n\t"
+        "movq   %%xmm1, (%%"REG_d")              \n\t"
+        "movhps %%xmm1, (%%"REG_d", %4)          \n\t"
+        "sal $1, %4                              \n\t"
+        "add $"PTR_SIZE"*2, %1                   \n\t"
+        "add $16*2, %2                           \n\t"
+        "add %4, %0                              \n\t"
+        inc_block("%4")
+        "sar $1, %4                              \n\t"
+        "sub $2, %3                              \n\t"
+        "jnz 1b                                  \n\t"
+    : "+m"(dst8), "+m"(dst_array),"+r"(obmc), "+rm"((long)b_h),
+      "+r"((long)src_stride)
+    : "rm"((long)(src_x<<1)), "a"(block)
+    : "%"REG_D,"%"REG_d);
 static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
                       int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
-snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
-snow_inner_add_yblock_sse2_accum_16("2", "16")
-snow_inner_add_yblock_sse2_accum_16("1", "512")
-snow_inner_add_yblock_sse2_accum_16("0", "528")
+    IDWTELEM **dst_array = sb->line + src_y;
+    asm volatile (
+        "pxor %%xmm7, %%xmm7                    \n\t"
+        "pcmpeqd %%xmm3, %%xmm3                 \n\t"
+        "psllw $15, %%xmm3                      \n\t"
+        "psrlw $12, %%xmm3                      \n\t"
+        "1:                                     \n\t"
+        "mov %1, %%"REG_D"                      \n\t"
+        "mov (%%"REG_D"), %%"REG_D"             \n\t"
+        "add %5, %%"REG_D"                      \n\t"
-             "mov %0, %%"REG_d"              \n\t"
-             "psrlw $4, %%xmm1               \n\t"
-             "psrlw $4, %%xmm5               \n\t"
-             "paddw   (%%"REG_D"), %%xmm1    \n\t"
-             "paddw 16(%%"REG_D"), %%xmm5    \n\t"
-             "paddw %%xmm3, %%xmm1           \n\t"
-             "paddw %%xmm3, %%xmm5           \n\t"
-             "psraw $4, %%xmm1               \n\t" /* FRAC_BITS. */
-             "psraw $4, %%xmm5               \n\t" /* FRAC_BITS. */
-             "packuswb %%xmm5, %%xmm1        \n\t"
+        load_block_line   (3,            "%%xmm1", "%%xmm5")
+        load_obmc_twolines("%2", 0,   8, "%%xmm0", "%%xmm4")
+        "pmullw %%xmm0, %%xmm1                  \n\t"
+        "pmullw %%xmm4, %%xmm5                  \n\t"
-             "movdqu %%xmm1, (%%"REG_d")       \n\t"
+        load_block_line   (2,             "%%xmm2", "%%xmm6")
+        load_obmc_twolines("%2", 16,   8, "%%xmm0", "%%xmm4")
+        "pmullw  %%xmm0, %%xmm2                  \n\t"
+        "pmullw  %%xmm4, %%xmm6                  \n\t"
+        "paddusw %%xmm2, %%xmm1                  \n\t"
+        "paddusw %%xmm6, %%xmm5                  \n\t"
+        load_block_line   (1,            "%%xmm2", "%%xmm6")
+        load_obmc_twolines("%2", 512, 8, "%%xmm0", "%%xmm4")
+        "pmullw  %%xmm0, %%xmm2                  \n\t"
+        "pmullw  %%xmm4, %%xmm6                  \n\t"
+        "paddusw %%xmm2, %%xmm1                  \n\t"
+        "paddusw %%xmm6, %%xmm5                  \n\t"
+        load_block_line   (0,            "%%xmm2", "%%xmm6")
+        load_obmc_twolines("%2", 528, 8, "%%xmm0", "%%xmm4")
+        "pmullw  %%xmm0, %%xmm2                  \n\t"
+        "pmullw  %%xmm4, %%xmm6                  \n\t"
+        "paddusw %%xmm2, %%xmm1                  \n\t"
+        "paddusw %%xmm6, %%xmm5                  \n\t"
+        "mov %0, %%"REG_d"                       \n\t"
+        "psrlw $4, %%xmm1                        \n\t"
+        "psrlw $4, %%xmm5                        \n\t"
+        "paddw %%xmm3, %%xmm1                    \n\t"
+        "paddw %%xmm3, %%xmm5                    \n\t"
+        "paddw   (%%"REG_D"), %%xmm1             \n\t"
+        "paddw 16(%%"REG_D"), %%xmm5             \n\t"
+        "psraw $4, %%xmm1                        \n\t"
+        "psraw $4, %%xmm5                        \n\t"
+        "packuswb %%xmm5, %%xmm1                 \n\t"
+        "movdqu   %%xmm1, (%%"REG_d")            \n\t"
+        "add $"PTR_SIZE", %1                     \n\t"
+        "add $16*2, %2                           \n\t"
+        "add %4, %0                              \n\t"
+        inc_block("%4")
+        "dec %3                                  \n\t"
+        "jnz 1b                                  \n\t"
+    : "+m"(dst8), "+m"(dst_array), "+r"(obmc), "+rm"((long)b_h),
+      "+r"((long)src_stride)
+    : "rm"((long)(src_x<<1)), "a"(block)
+    : "%"REG_D,"%"REG_d);
 #define snow_inner_add_yblock_mmx_header \

More information about the ffmpeg-devel mailing list