[FFmpeg-cvslog] x86: hpeldsp: implement SSE2 versions
Christophe Gisquet
git at videolan.org
Sat May 24 03:43:30 CEST 2014
ffmpeg | branch: master | Christophe Gisquet <christophe.gisquet at gmail.com> | Thu May 22 17:48:19 2014 +0000| [f0aca50e0b21d7c97b091f8e551719e0da574e12] | committer: Michael Niedermayer
x86: hpeldsp: implement SSE2 versions
Those are mostly used in codecs older than H.264, eg MPEG-2.
put16 versions:
mmx mmx2 sse2
x2: 1888 1185 552
y2: 1778 1092 510
avg16 xy2: 3509(mmx2) -> 2169(sse2)
Signed-off-by: Michael Niedermayer <michaelni at gmx.at>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=f0aca50e0b21d7c97b091f8e551719e0da574e12
---
libavcodec/x86/hpeldsp.asm | 115 ++++++++++++++++++++++++++++++-----------
libavcodec/x86/hpeldsp_init.c | 15 ++++++
2 files changed, 100 insertions(+), 30 deletions(-)
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index 2adead2..1d26c45 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -35,21 +35,39 @@ SECTION_TEXT
; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_PIXELS8_X2 0
+%if cpuflag(sse2)
+cglobal put_pixels16_x2, 4,5,4
+%else
cglobal put_pixels8_x2, 4,5
+%endif
lea r4, [r2*2]
.loop:
- mova m0, [r1]
- mova m1, [r1+r2]
- PAVGB m0, [r1+1]
- PAVGB m1, [r1+r2+1]
+ movu m0, [r1+1]
+ movu m1, [r1+r2+1]
+%if cpuflag(sse2)
+ movu m2, [r1]
+ movu m3, [r1+r2]
+ pavgb m0, m2
+ pavgb m1, m3
+%else
+ PAVGB m0, [r1]
+ PAVGB m1, [r1+r2]
+%endif
mova [r0], m0
mova [r0+r2], m1
add r1, r4
add r0, r4
- mova m0, [r1]
- mova m1, [r1+r2]
- PAVGB m0, [r1+1]
- PAVGB m1, [r1+r2+1]
+ movu m0, [r1+1]
+ movu m1, [r1+r2+1]
+%if cpuflag(sse2)
+ movu m2, [r1]
+ movu m3, [r1+r2]
+ pavgb m0, m2
+ pavgb m1, m3
+%else
+ PAVGB m0, [r1]
+ PAVGB m1, [r1+r2]
+%endif
add r1, r4
mova [r0], m0
mova [r0+r2], m1
@@ -107,6 +125,9 @@ INIT_MMX mmxext
PUT_PIXELS_16
INIT_MMX 3dnow
PUT_PIXELS_16
+; The 8_X2 macro can easily be used here
+INIT_XMM sse2
+PUT_PIXELS8_X2
; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -199,20 +220,24 @@ PUT_NO_RND_PIXELS8_X2_EXACT
; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_PIXELS8_Y2 0
+%if cpuflag(sse2)
+cglobal put_pixels16_y2, 4,5,3
+%else
cglobal put_pixels8_y2, 4,5
+%endif
lea r4, [r2*2]
- mova m0, [r1]
+ movu m0, [r1]
sub r0, r2
.loop:
- mova m1, [r1+r2]
- mova m2, [r1+r4]
+ movu m1, [r1+r2]
+ movu m2, [r1+r4]
add r1, r4
PAVGB m0, m1
PAVGB m1, m2
mova [r0+r2], m0
mova [r0+r4], m1
- mova m1, [r1+r2]
- mova m0, [r1+r4]
+ movu m1, [r1+r2]
+ movu m0, [r1+r4]
add r0, r4
add r1, r4
PAVGB m2, m1
@@ -229,6 +254,9 @@ INIT_MMX mmxext
PUT_PIXELS8_Y2
INIT_MMX 3dnow
PUT_PIXELS8_Y2
+; actually, put_pixels16_y2_sse2
+INIT_XMM sse2
+PUT_PIXELS8_Y2
; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -352,34 +380,50 @@ AVG_PIXELS8
%endmacro
%macro AVG_PIXELS8_X2 0
+%if cpuflag(sse2)
+cglobal avg_pixels16_x2, 4,5,4
+%else
cglobal avg_pixels8_x2, 4,5
+%endif
lea r4, [r2*2]
%if notcpuflag(mmxext)
pcmpeqd m5, m5
paddb m5, m5
%endif
.loop:
- mova m0, [r1]
- mova m2, [r1+r2]
+ movu m0, [r1]
+ movu m2, [r1+r2]
%if notcpuflag(mmxext)
PAVGB_MMX [r1+1], m0, m3, m5
PAVGB_MMX [r1+r2+1], m2, m4, m5
PAVGB_MMX [r0], m0, m3, m5
PAVGB_MMX [r0+r2], m2, m4, m5
%else
+%if cpuflag(sse2)
+ movu m1, [r1+1]
+ movu m3, [r1+r2+1]
+ pavgb m0, m1
+ pavgb m2, m3
+%else
PAVGB m0, [r1+1]
PAVGB m2, [r1+r2+1]
+%endif
PAVGB m0, [r0]
PAVGB m2, [r0+r2]
%endif
add r1, r4
mova [r0], m0
mova [r0+r2], m2
- mova m0, [r1]
- mova m2, [r1+r2]
+ movu m0, [r1]
+ movu m2, [r1+r2]
%if notcpuflag(mmxext)
PAVGB_MMX [r1+1], m0, m3, m5
PAVGB_MMX [r1+r2+1], m2, m4, m5
+%elif cpuflag(sse2)
+ movu m1, [r1+1]
+ movu m3, [r1+r2+1]
+ pavgb m0, m1
+ pavgb m2, m3
%else
PAVGB m0, [r1+1]
PAVGB m2, [r1+r2+1]
@@ -389,6 +433,9 @@ cglobal avg_pixels8_x2, 4,5
%if notcpuflag(mmxext)
PAVGB_MMX [r0], m0, m3, m5
PAVGB_MMX [r0+r2], m2, m4, m5
+%elif cpuflag(sse2)
+ pavgb m0, [r0]
+ pavgb m2, [r0+r2]
%else
PAVGB m0, [r0]
PAVGB m2, [r0+r2]
@@ -407,36 +454,39 @@ INIT_MMX mmxext
AVG_PIXELS8_X2
INIT_MMX 3dnow
AVG_PIXELS8_X2
+; actually avg_pixels16_x2
+INIT_XMM sse2
+AVG_PIXELS8_X2
; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro AVG_PIXELS8_Y2 0
+%if cpuflag(sse2)
+cglobal avg_pixels16_y2, 4,5,3
+%else
cglobal avg_pixels8_y2, 4,5
+%endif
lea r4, [r2*2]
- mova m0, [r1]
+ movu m0, [r1]
sub r0, r2
.loop:
- mova m1, [r1+r2]
- mova m2, [r1+r4]
+ movu m1, [r1+r2]
+ movu m2, [r1+r4]
add r1, r4
PAVGB m0, m1
PAVGB m1, m2
- mova m3, [r0+r2]
- mova m4, [r0+r4]
- PAVGB m0, m3
- PAVGB m1, m4
+ PAVGB m0, [r0+r2]
+ PAVGB m1, [r0+r4]
mova [r0+r2], m0
mova [r0+r4], m1
- mova m1, [r1+r2]
- mova m0, [r1+r4]
+ movu m1, [r1+r2]
+ movu m0, [r1+r4]
PAVGB m2, m1
PAVGB m1, m0
add r0, r4
add r1, r4
- mova m3, [r0+r2]
- mova m4, [r0+r4]
- PAVGB m2, m3
- PAVGB m1, m4
+ PAVGB m2, [r0+r2]
+ PAVGB m1, [r0+r4]
mova [r0+r2], m2
mova [r0+r4], m1
add r0, r4
@@ -449,6 +499,9 @@ INIT_MMX mmxext
AVG_PIXELS8_Y2
INIT_MMX 3dnow
AVG_PIXELS8_Y2
+; actually avg_pixels16_y2
+INIT_XMM sse2
+AVG_PIXELS8_Y2
; void ff_avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -571,3 +624,5 @@ INIT_MMX mmxext
AVG_PIXELS_XY2
INIT_MMX 3dnow
AVG_PIXELS_XY2
+INIT_XMM sse2
+AVG_PIXELS_XY2
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index 5e2ecb5..05bd561 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -40,6 +40,16 @@ void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
+void ff_put_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
@@ -284,7 +294,12 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int cpu_flags)
// these functions are slower than mmx on AMD, but faster on Intel
c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
+ c->put_pixels_tab[0][1] = ff_put_pixels16_x2_sse2;
+ c->put_pixels_tab[0][2] = ff_put_pixels16_y2_sse2;
c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
+ c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_sse2;
+ c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_sse2;
+ c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_sse2;
}
#endif /* HAVE_SSE2_EXTERNAL */
}
More information about the ffmpeg-cvslog
mailing list