[FFmpeg-devel] [PATCH] ppc: replace vec_ld(0), vec_ld(1) by VEC_LD2() which has fewer loads

Michael Niedermayer michaelni at gmx.at
Fri Nov 7 15:12:19 CET 2014


This needs to be benchmarked; I do not have PPC hardware.
On big endian, this is closer to how the code was before commit 79e0255956bc8fcdb143f39b2e45db77144ac017.

Signed-off-by: Michael Niedermayer <michaelni at gmx.at>
---
 libavcodec/ppc/hpeldsp_altivec.c |   30 ++++++++++--------------------
 libavutil/ppc/util_altivec.h     |   16 ++++++++++++++++
 2 files changed, 26 insertions(+), 20 deletions(-)

diff --git a/libavcodec/ppc/hpeldsp_altivec.c b/libavcodec/ppc/hpeldsp_altivec.c
index 87a1f05..05d8b81 100644
--- a/libavcodec/ppc/hpeldsp_altivec.c
+++ b/libavcodec/ppc/hpeldsp_altivec.c
@@ -123,8 +123,7 @@ static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
 
-    pixelsv1 = VEC_LD(0, pixels);
-    pixelsv2 = VEC_LD(1, pixels);
+    VEC_LD2(pixelsv1, pixelsv2, 0, pixels);
     pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
     pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
 
@@ -136,8 +135,7 @@ static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
         int rightside = ((unsigned long)block & 0x0000000F);
         blockv = vec_ld(0, block);
 
-        pixelsv1 = unaligned_load(line_size, pixels);
-        pixelsv2 = unaligned_load(line_size+1, pixels);
+        VEC_LD2(pixelsv1, pixelsv2, line_size, pixels);
         pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
         pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
         pixelssum2 = vec_add((vector unsigned short)pixelsv1,
@@ -171,8 +169,7 @@ static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels
     register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
 
-    pixelsv1 = VEC_LD(0, pixels);
-    pixelsv2 = VEC_LD(1, pixels);
+    VEC_LD2(pixelsv1, pixelsv2, 0, pixels);
     pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
     pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
     pixelssum1 = vec_add((vector unsigned short)pixelsv1,
@@ -183,8 +180,7 @@ static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels
         int rightside = ((unsigned long)block & 0x0000000F);
         blockv = vec_ld(0, block);
 
-        pixelsv1 = unaligned_load(line_size, pixels);
-        pixelsv2 = unaligned_load(line_size+1, pixels);
+        VEC_LD2(pixelsv1, pixelsv2, line_size, pixels);
         pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
         pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
         pixelssum2 = vec_add((vector unsigned short)pixelsv1,
@@ -218,8 +214,7 @@ static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, pt
     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
 
-    pixelsv1 = VEC_LD(0, pixels);
-    pixelsv2 = VEC_LD(1, pixels);
+    VEC_LD2(pixelsv1, pixelsv2, 0, pixels);
     pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
     pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
     pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
@@ -234,8 +229,7 @@ static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, pt
     for (i = 0; i < h ; i++) {
         blockv = vec_ld(0, block);
 
-        pixelsv1 = unaligned_load(line_size, pixels);
-        pixelsv2 = unaligned_load(line_size+1, pixels);
+        VEC_LD2(pixelsv1, pixelsv2, line_size, pixels);
 
         pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
         pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
@@ -274,8 +268,7 @@ static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pix
     register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
 
-    pixelsv1 = VEC_LD(0, pixels);
-    pixelsv2 = VEC_LD(1, pixels);
+    VEC_LD2(pixelsv1, pixelsv2, 0, pixels);
     pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
     pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
     pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
@@ -288,8 +281,7 @@ static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pix
     pixelssum1 = vec_add(pixelssum1, vcone);
 
     for (i = 0; i < h ; i++) {
-        pixelsv1 = unaligned_load(line_size, pixels);
-        pixelsv2 = unaligned_load(line_size+1, pixels);
+        VEC_LD2(pixelsv1, pixelsv2, line_size, pixels);
 
         pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
         pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
@@ -329,8 +321,7 @@ static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
     register const vector unsigned short vctwo = (const vector unsigned short)
                                         vec_splat_u16(2);
 
-    pixelsv1 = VEC_LD(0, pixels);
-    pixelsv2 = VEC_LD(1, pixels);
+    VEC_LD2(pixelsv1, pixelsv2, 0, pixels);
     pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
     pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
     pixelssum1 = vec_add((vector unsigned short)pixelsv1,
@@ -341,8 +332,7 @@ static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
         int rightside = ((unsigned long)block & 0x0000000F);
         blockv = vec_ld(0, block);
 
-        pixelsv1 = unaligned_load(line_size, pixels);
-        pixelsv2 = unaligned_load(line_size+1, pixels);
+        VEC_LD2(pixelsv1, pixelsv2, line_size, pixels);
 
         pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
         pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
diff --git a/libavutil/ppc/util_altivec.h b/libavutil/ppc/util_altivec.h
index fd3bfd3..9fda566 100644
--- a/libavutil/ppc/util_altivec.h
+++ b/libavutil/ppc/util_altivec.h
@@ -88,9 +88,25 @@ do { \
 #if HAVE_BIGENDIAN
 #define VEC_LD(offset,b)                                   \
     vec_perm(vec_ld(offset, b), vec_ld(offset+15, b), vec_lvsl(offset, b))
+
+#define VEC_LD2(dst1, dst2, offset, b) do { /* fill dst1/dst2 from two shared vec_ld loads */ \
+        register vector unsigned char temp1 = vec_ld((offset), (b));      \
+        register vector unsigned char temp2 = vec_ld((offset) + 16, (b)); \
+        (dst1) = vec_perm(temp1, temp2, vec_lvsl((offset), (b)));         \
+        if ((((unsigned long)((b) + (offset))) & 0x0000000F) == 0x0000000F) { \
+            (dst2) = temp2; /* (b)+(offset)+1 is 16-byte aligned here: use temp2 unpermuted */ \
+        } else {                                                          \
+            (dst2) = vec_perm(temp1, temp2, vec_lvsl((offset) + 1, (b))); \
+        }                                                                 \
+    } while(0)
 #else
 #define VEC_LD(offset,b)                                   \
     vec_vsx_ld(offset, b)
+
+#define VEC_LD2(dst1, dst2, offset, b)  do { /* !HAVE_BIGENDIAN: two independent VEC_LD loads */ \
+        (dst1) = VEC_LD(offset  ,b);       /* load at offset     */ \
+        (dst2) = VEC_LD((offset)+1,b);     /* load at offset + 1 */ \
+    } while(0)
 #endif
 
 /** @brief loads unaligned vector @a *src with offset @a offset
-- 
1.7.9.5



More information about the ffmpeg-devel mailing list