[FFmpeg-devel] [PATCH] snow SSE2 add_yblock

Reimar Döffinger Reimar.Doeffinger
Thu Aug 30 16:56:41 CEST 2007


Hello,
the attached patch should contain a working version.
I have replaced several of the hardcoded registers with something more
flexible, because I also found it nicer to read.
Suggestions are welcome (though IMO optimizations should be done after
applying and re-enabling).
Also, it is better not to try to read the patch itself; instead, apply it
and read the resulting asm, because diff made something quite butchered
out of this.

Greetings,
Reimar Döffinger
-------------- next part --------------
Index: libavcodec/i386/snowdsp_mmx.c
===================================================================
--- libavcodec/i386/snowdsp_mmx.c	(revision 10261)
+++ libavcodec/i386/snowdsp_mmx.c	(working copy)
@@ -603,157 +603,173 @@
 }
 #endif //HAVE_7REGS
 
-#define snow_inner_add_yblock_sse2_header \
-    IDWTELEM * * dst_array = sb->line + src_y;\
-    long tmp;\
-    asm volatile(\
-             "mov  %7, %%"REG_c"             \n\t"\
-             "mov  %6, %2                    \n\t"\
-             "mov  %4, %%"REG_S"             \n\t"\
-             "pxor %%xmm7, %%xmm7            \n\t" /* 0 */\
-             "pcmpeqd %%xmm3, %%xmm3         \n\t"\
-             "psllw $15, %%xmm3              \n\t"\
-             "psrlw $12, %%xmm3              \n\t" /* FRAC_BITS >> 1 */\
-             "1:                             \n\t"\
-             "mov %1, %%"REG_D"              \n\t"\
-             "mov (%%"REG_D"), %%"REG_D"     \n\t"\
-             "add %3, %%"REG_D"              \n\t"
+#define load_block_line(block, dst1, dst2) \
+             "mov "PTR_SIZE"*"#block"(%%"REG_a"), %%"REG_d" \n\t"\
+             "movq  (%%"REG_d"), "dst1"                     \n\t"\
+             "movq 8(%%"REG_d"), "dst2"                     \n\t"\
+             "punpcklbw          %%xmm7, "dst1"             \n\t"\
+             "punpcklbw          %%xmm7, "dst2"             \n\t"
 
-#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
-             "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
-             "movq (%%"REG_d"), %%"out_reg1" \n\t"\
-             "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\
-             "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
-             "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
-             "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
-             "movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\
-             "punpcklbw %%xmm7, %%xmm0       \n\t"\
-             "punpcklbw %%xmm7, %%xmm4       \n\t"\
-             "pmullw %%xmm0, %%"out_reg1"    \n\t"\
-             "pmullw %%xmm4, %%"out_reg2"    \n\t"
+#define load_block_twolines(block, stride, dst1, dst2) \
+             "mov "PTR_SIZE"*"#block"(%%"REG_a"), %%"REG_d" \n\t"\
+             "movq (%%"REG_d"          ), "dst1"            \n\t"\
+             "movq (%%"REG_d", "stride"), "dst2"            \n\t"\
+             "punpcklbw          %%xmm7, "dst1"             \n\t"\
+             "punpcklbw          %%xmm7, "dst2"             \n\t"
 
-#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
-             "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
-             "movq (%%"REG_d"), %%"out_reg1" \n\t"\
-             "movq 8(%%"REG_d"), %%"out_reg2" \n\t"\
-             "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
-             "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
-             "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
-             "movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\
-             "punpcklbw %%xmm7, %%xmm0       \n\t"\
-             "punpcklbw %%xmm7, %%xmm4       \n\t"\
-             "pmullw %%xmm0, %%"out_reg1"    \n\t"\
-             "pmullw %%xmm4, %%"out_reg2"    \n\t"
+#define load_obmc_twolines(obmc, offset, stride, dst1, dst2) \
+             "movq           "#offset"("obmc"), "dst1"      \n\t"\
+             "movq "#stride"+"#offset"("obmc"), "dst2"      \n\t"\
+             "punpcklbw          %%xmm7, "dst1"             \n\t"\
+             "punpcklbw          %%xmm7, "dst2"             \n\t"
 
-#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
-             snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
-             "paddusw %%xmm2, %%xmm1         \n\t"\
-             "paddusw %%xmm6, %%xmm5         \n\t"
+#define inc_block(delta) \
+             "add "delta", "PTR_SIZE"*3(%%"REG_a")  \n\t"\
+             "add "delta", "PTR_SIZE"*2(%%"REG_a")  \n\t"\
+             "add "delta", "PTR_SIZE"*1(%%"REG_a")  \n\t"\
+             "add "delta", (%%"REG_a")              \n\t"
 
-#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
-             snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
-             "paddusw %%xmm2, %%xmm1         \n\t"\
-             "paddusw %%xmm6, %%xmm5         \n\t"
+static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
+                      int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+    IDWTELEM **dst_array = sb->line + src_y;
+    asm volatile (
+        "pxor %%xmm7, %%xmm7                    \n\t"
+        "pcmpeqd %%xmm3, %%xmm3                 \n\t"
+        "psllw $15, %%xmm3                      \n\t"
+        "psrlw $12, %%xmm3                      \n\t"
+        "1:                                     \n\t"
+        "mov %1, %%"REG_D"                      \n\t"
+        "mov (%%"REG_D"), %%"REG_D"             \n\t"
+        "add %5, %%"REG_D"                      \n\t"
 
-#define snow_inner_add_yblock_sse2_end_common1\
-             "add $32, %%"REG_S"             \n\t"\
-             "add %%"REG_c", %0              \n\t"\
-             "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
-             "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
-             "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
-             "add %%"REG_c", (%%"REG_a")     \n\t"
+        load_block_twolines(3,       "%4", "%%xmm1", "%%xmm5")
+        load_obmc_twolines ("%2",   0, 16, "%%xmm0", "%%xmm4")
+        "pmullw %%xmm0, %%xmm1                  \n\t"
+        "pmullw %%xmm4, %%xmm5                  \n\t"
 
-#define snow_inner_add_yblock_sse2_end_common2\
-             "jnz 1b                         \n\t"\
-             :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
-             :\
-             "rm"((long)(src_x<<1)),"m"(obmc),"a"(block),"m"((long)b_h),"m"((long)src_stride):\
-             "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
+        load_block_twolines(2,       "%4", "%%xmm2", "%%xmm6")
+        load_obmc_twolines ("%2",   8, 16, "%%xmm0", "%%xmm4")
+        "pmullw  %%xmm0, %%xmm2                  \n\t"
+        "pmullw  %%xmm4, %%xmm6                  \n\t"
+        "paddw   %%xmm2, %%xmm1                  \n\t"
+        "paddw   %%xmm6, %%xmm5                  \n\t"
 
-#define snow_inner_add_yblock_sse2_end_8\
-             "sal $1, %%"REG_c"              \n\t"\
-             "add $"PTR_SIZE"*2, %1          \n\t"\
-             snow_inner_add_yblock_sse2_end_common1\
-             "sar $1, %%"REG_c"              \n\t"\
-             "sub $2, %2                     \n\t"\
-             snow_inner_add_yblock_sse2_end_common2
+        load_block_twolines(1,       "%4", "%%xmm2", "%%xmm6")
+        load_obmc_twolines ("%2", 128, 16, "%%xmm0", "%%xmm4")
+        "pmullw  %%xmm0, %%xmm2                  \n\t"
+        "pmullw  %%xmm4, %%xmm6                  \n\t"
+        "paddw   %%xmm2, %%xmm1                  \n\t"
+        "paddw   %%xmm6, %%xmm5                  \n\t"
 
-#define snow_inner_add_yblock_sse2_end_16\
-             "add $"PTR_SIZE"*1, %1          \n\t"\
-             snow_inner_add_yblock_sse2_end_common1\
-             "dec %2                         \n\t"\
-             snow_inner_add_yblock_sse2_end_common2
+        load_block_twolines(0,       "%4", "%%xmm2", "%%xmm6")
+        load_obmc_twolines ("%2", 136, 16, "%%xmm0", "%%xmm4")
+        "pmullw  %%xmm0, %%xmm2                  \n\t"
+        "pmullw  %%xmm4, %%xmm6                  \n\t"
+        "paddw   %%xmm2, %%xmm1                  \n\t"
+        "paddw   %%xmm6, %%xmm5                  \n\t"
 
-static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
-                      int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
-snow_inner_add_yblock_sse2_header
-snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
-snow_inner_add_yblock_sse2_accum_8("2", "8")
-snow_inner_add_yblock_sse2_accum_8("1", "128")
-snow_inner_add_yblock_sse2_accum_8("0", "136")
+        "mov %0, %%"REG_d"                       \n\t"
 
-             "mov %0, %%"REG_d"              \n\t"
-             "movdqa (%%"REG_D"), %%xmm0     \n\t"
-             "movdqa %%xmm1, %%xmm2          \n\t"
+        "movdqu (%%"REG_D"), %%xmm2              \n\t"
 
-             "punpckhwd %%xmm7, %%xmm1       \n\t"
-             "punpcklwd %%xmm7, %%xmm2       \n\t"
-             "paddd %%xmm2, %%xmm0           \n\t"
-             "movdqa 16(%%"REG_D"), %%xmm2   \n\t"
-             "paddd %%xmm1, %%xmm2           \n\t"
-             "paddd %%xmm3, %%xmm0           \n\t"
-             "paddd %%xmm3, %%xmm2           \n\t"
+        "psrlw $4, %%xmm1                        \n\t"
+        "psrlw $4, %%xmm5                        \n\t"
 
-             "mov %1, %%"REG_D"              \n\t"
-             "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t"
-             "add %3, %%"REG_D"              \n\t"
+        "mov %1, %%"REG_D"                       \n\t"
+        "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D"    \n\t"
+        "add %5, %%"REG_D"                       \n\t"
 
-             "movdqa (%%"REG_D"), %%xmm4     \n\t"
-             "movdqa %%xmm5, %%xmm6          \n\t"
-             "punpckhwd %%xmm7, %%xmm5       \n\t"
-             "punpcklwd %%xmm7, %%xmm6       \n\t"
-             "paddd %%xmm6, %%xmm4           \n\t"
-             "movdqa 16(%%"REG_D"), %%xmm6   \n\t"
-             "paddd %%xmm5, %%xmm6           \n\t"
-             "paddd %%xmm3, %%xmm4           \n\t"
-             "paddd %%xmm3, %%xmm6           \n\t"
+        "movdqu (%%"REG_D"), %%xmm6              \n\t"
 
-             "psrad $8, %%xmm0               \n\t" /* FRAC_BITS. */
-             "psrad $8, %%xmm2               \n\t" /* FRAC_BITS. */
-             "packssdw %%xmm2, %%xmm0        \n\t"
-             "packuswb %%xmm7, %%xmm0        \n\t"
-             "movq %%xmm0, (%%"REG_d")       \n\t"
+        "paddw   %%xmm2, %%xmm1                  \n\t"
+        "paddw   %%xmm6, %%xmm5                  \n\t"
 
-             "psrad $8, %%xmm4               \n\t" /* FRAC_BITS. */
-             "psrad $8, %%xmm6               \n\t" /* FRAC_BITS. */
-             "packssdw %%xmm6, %%xmm4        \n\t"
-             "packuswb %%xmm7, %%xmm4        \n\t"
-             "movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t"
-snow_inner_add_yblock_sse2_end_8
+        "paddw   %%xmm3, %%xmm1                  \n\t"
+        "paddw   %%xmm3, %%xmm5                  \n\t"
+        "psraw $4, %%xmm1                        \n\t"
+        "psraw $4, %%xmm5                        \n\t"
+
+        "packuswb %%xmm5, %%xmm1                 \n\t"
+        "movq   %%xmm1, (%%"REG_d")              \n\t"
+        "movhps %%xmm1, (%%"REG_d", %4)          \n\t"
+
+        "sal $1, %4                              \n\t"
+        "add $"PTR_SIZE"*2, %1                   \n\t"
+        "add $16*2, %2                           \n\t"
+        "add %4, %0                              \n\t"
+        inc_block("%4")
+        "sar $1, %4                              \n\t"
+        "sub $2, %3                              \n\t"
+        "jnz 1b                                  \n\t"
+    : "+m"(dst8), "+m"(dst_array),"+r"(obmc), "+rm"((long)b_h),
+      "+r"((long)src_stride)
+    : "rm"((long)(src_x<<1)), "a"(block)
+    : "%"REG_D,"%"REG_d);
 }
 
 static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
                       int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
-snow_inner_add_yblock_sse2_header
-snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
-snow_inner_add_yblock_sse2_accum_16("2", "16")
-snow_inner_add_yblock_sse2_accum_16("1", "512")
-snow_inner_add_yblock_sse2_accum_16("0", "528")
+    IDWTELEM **dst_array = sb->line + src_y;
+    asm volatile (
+        "pxor %%xmm7, %%xmm7                    \n\t"
+        "pcmpeqd %%xmm3, %%xmm3                 \n\t"
+        "psllw $15, %%xmm3                      \n\t"
+        "psrlw $12, %%xmm3                      \n\t"
+        "1:                                     \n\t"
+        "mov %1, %%"REG_D"                      \n\t"
+        "mov (%%"REG_D"), %%"REG_D"             \n\t"
+        "add %5, %%"REG_D"                      \n\t"
 
-             "mov %0, %%"REG_d"              \n\t"
-             "psrlw $4, %%xmm1               \n\t"
-             "psrlw $4, %%xmm5               \n\t"
-             "paddw   (%%"REG_D"), %%xmm1    \n\t"
-             "paddw 16(%%"REG_D"), %%xmm5    \n\t"
-             "paddw %%xmm3, %%xmm1           \n\t"
-             "paddw %%xmm3, %%xmm5           \n\t"
-             "psraw $4, %%xmm1               \n\t" /* FRAC_BITS. */
-             "psraw $4, %%xmm5               \n\t" /* FRAC_BITS. */
-             "packuswb %%xmm5, %%xmm1        \n\t"
+        load_block_line   (3,            "%%xmm1", "%%xmm5")
+        load_obmc_twolines("%2", 0,   8, "%%xmm0", "%%xmm4")
+        "pmullw %%xmm0, %%xmm1                  \n\t"
+        "pmullw %%xmm4, %%xmm5                  \n\t"
 
-             "movdqu %%xmm1, (%%"REG_d")       \n\t"
+        load_block_line   (2,             "%%xmm2", "%%xmm6")
+        load_obmc_twolines("%2", 16,   8, "%%xmm0", "%%xmm4")
+        "pmullw  %%xmm0, %%xmm2                  \n\t"
+        "pmullw  %%xmm4, %%xmm6                  \n\t"
+        "paddusw %%xmm2, %%xmm1                  \n\t"
+        "paddusw %%xmm6, %%xmm5                  \n\t"
 
-snow_inner_add_yblock_sse2_end_16
+        load_block_line   (1,            "%%xmm2", "%%xmm6")
+        load_obmc_twolines("%2", 512, 8, "%%xmm0", "%%xmm4")
+        "pmullw  %%xmm0, %%xmm2                  \n\t"
+        "pmullw  %%xmm4, %%xmm6                  \n\t"
+        "paddusw %%xmm2, %%xmm1                  \n\t"
+        "paddusw %%xmm6, %%xmm5                  \n\t"
+
+        load_block_line   (0,            "%%xmm2", "%%xmm6")
+        load_obmc_twolines("%2", 528, 8, "%%xmm0", "%%xmm4")
+        "pmullw  %%xmm0, %%xmm2                  \n\t"
+        "pmullw  %%xmm4, %%xmm6                  \n\t"
+        "paddusw %%xmm2, %%xmm1                  \n\t"
+        "paddusw %%xmm6, %%xmm5                  \n\t"
+
+        "mov %0, %%"REG_d"                       \n\t"
+
+        "psrlw $4, %%xmm1                        \n\t"
+        "psrlw $4, %%xmm5                        \n\t"
+        "paddw %%xmm3, %%xmm1                    \n\t"
+        "paddw %%xmm3, %%xmm5                    \n\t"
+        "paddw   (%%"REG_D"), %%xmm1             \n\t"
+        "paddw 16(%%"REG_D"), %%xmm5             \n\t"
+
+        "psraw $4, %%xmm1                        \n\t"
+        "psraw $4, %%xmm5                        \n\t"
+        "packuswb %%xmm5, %%xmm1                 \n\t"
+        "movdqu   %%xmm1, (%%"REG_d")            \n\t"
+
+        "add $"PTR_SIZE", %1                     \n\t"
+        "add $16*2, %2                           \n\t"
+        "add %4, %0                              \n\t"
+        inc_block("%4")
+        "dec %3                                  \n\t"
+        "jnz 1b                                  \n\t"
+    : "+m"(dst8), "+m"(dst_array), "+r"(obmc), "+rm"((long)b_h),
+      "+r"((long)src_stride)
+    : "rm"((long)(src_x<<1)), "a"(block)
+    : "%"REG_D,"%"REG_d);
 }
 
 #define snow_inner_add_yblock_mmx_header \



More information about the ffmpeg-devel mailing list