[FFmpeg-devel] [FFMpeg-Devel] [PATCH 4/7] Replaced BLOCK_SIZE macro with block_height and block_width variables

Fri Mar 13 19:15:11 CET 2015

This change allows support for differently sized blocks, which will
be necessary for SSE and AVX. My plan is for the code to still act on
8x8 blocks, but to process multiple 8x8 blocks in parallel when using
SSE/AVX.
---
 libpostproc/postprocess.c          |  3 ---
 libpostproc/postprocess_c.c        | 36 ++++++++++++++++++------------------
 libpostproc/postprocess_internal.h | 17 ++++++++++++++++-
 libpostproc/postprocess_template.c | 18 +++++++++---------
 4 files changed, 43 insertions(+), 31 deletions(-)

diff --git a/libpostproc/postprocess.c b/libpostproc/postprocess.c
index 2cdd988..3090869 100644
--- a/libpostproc/postprocess.c
+++ b/libpostproc/postprocess.c
@@ -115,9 +115,6 @@ const char *postproc_license(void)
 
 #define GET_MODE_BUFFER_SIZE 500
 #define OPTIONS_ARRAY_SIZE 10
-#define BLOCK_SIZE 8
-#define TEMP_STRIDE 8
-//#define NUM_BLOCKS_AT_ONCE 16 //not used yet
 
 #if ARCH_X86 && HAVE_INLINE_ASM
 DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL;
diff --git a/libpostproc/postprocess_c.c b/libpostproc/postprocess_c.c
index 3d3b738..5660c64 100644
--- a/libpostproc/postprocess_c.c
+++ b/libpostproc/postprocess_c.c
@@ -32,7 +32,7 @@ static inline int isHorizDC_C(const uint8_t src[], int stride, const PPContext *
     const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
     const int dcThreshold= dcOffset*2 + 1;
 
-    for(y=0; y<BLOCK_SIZE; y++){
+    for(y=0; y<block_height; y++){
         numEq += ((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold;
         numEq += ((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold;
         numEq += ((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold;
@@ -56,7 +56,7 @@ static inline int isVertDC_C(const uint8_t src[], int stride, const PPContext *c
     const int dcThreshold= dcOffset*2 + 1;
 
     src+= stride*4; // src points to begin of the 8x8 Block
-    for(y=0; y<BLOCK_SIZE-1; y++){
+    for(y=0; y<block_height-1; y++){
         numEq += ((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold;
         numEq += ((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold;
         numEq += ((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold;
@@ -90,7 +90,7 @@ static inline int isVertMinMaxOk_C(const uint8_t src[], int stride, int QP)
 {
     int x;
     src+= stride*4;
-    for(x=0; x<BLOCK_SIZE; x+=4){
+    for(x=0; x<block_width; x+=4){
         if((unsigned)(src[  x + 0*stride] - src[  x + 5*stride] + 2*QP) > 4*QP) return 0;
         if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
         if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
@@ -120,7 +120,7 @@ static inline int vertClassify_C(const uint8_t src[], int stride, const PPContex
 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, const PPContext *c)
 {
     int y;
-    for(y=0; y<BLOCK_SIZE; y++){
+    for(y=0; y<block_height; y++){
         const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
 
         if(FFABS(middleEnergy) < 8*c->QP){
@@ -159,7 +159,7 @@ static inline void doHorizDefFilter_C(uint8_t dst[], int stride, const PPContext
 static inline void doHorizLowPass_C(uint8_t dst[], int stride, const PPContext *c)
 {
     int y;
-    for(y=0; y<BLOCK_SIZE; y++){
+    for(y=0; y<block_height; y++){
         const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
         const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
 
@@ -229,7 +229,7 @@ static inline void horizX1Filter(uint8_t *src, int stride, int QP)
         }
     }
 
-    for(y=0; y<BLOCK_SIZE; y++){
+    for(y=0; y<block_height; y++){
         int a= src[1] - src[2];
         int b= src[3] - src[4];
         int c= src[5] - src[6];
@@ -392,7 +392,7 @@ static inline void doVertLowPass_C(uint8_t *src, int stride, PPContext *c)
     const int l9= stride + l8;
     int x;
     src+= stride*3;
-    for(x=0; x<BLOCK_SIZE; x++){
+    for(x=0; x<block_width; x++){
         const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
         const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
 
@@ -443,7 +443,7 @@ static inline void vertX1Filter_C(uint8_t *src, int stride, PPContext *co)
     int x;
 
     src+= stride*3;
-    for(x=0; x<BLOCK_SIZE; x++){
+    for(x=0; x<block_width; x++){
         int a= src[l3] - src[l4];
         int b= src[l4] - src[l5];
         int c= src[l5] - src[l6];
@@ -478,7 +478,7 @@ static inline void doVertDefFilter_C(uint8_t src[], int stride, PPContext *c)
 //    const int l9= stride + l8;
     int x;
     src+= stride*3;
-    for(x=0; x<BLOCK_SIZE; x++){
+    for(x=0; x<block_width; x++){
         const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
         if(FFABS(middleEnergy) < 8*c->QP){
             const int q=(src[l4] - src[l5])/2;
@@ -881,13 +881,13 @@ static inline void blockCopy_C(uint8_t dst[], int dstStride, const uint8_t src[]
 {
     int i;
     if(levelFix){
-    for(i=0; i<8; i++)
+    for(i=0; i<block_height; i++)
         memcpy( &(dst[dstStride*i]),
-                &(src[srcStride*i]), BLOCK_SIZE);
+                &(src[srcStride*i]), block_width);
     }else{
-    for(i=0; i<8; i++)
+    for(i=0; i<block_height; i++)
         memcpy( &(dst[dstStride*i]),
-                &(src[srcStride*i]), BLOCK_SIZE);
+                &(src[srcStride*i]), block_width);
     }
 }
 
@@ -908,7 +908,7 @@ static inline void duplicate_C(uint8_t src[], int stride)
  * Filter array of bytes (Y or U or V values)
  */
 static void postProcess_C(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
-                                const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
+                          const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
 {
     DECLARE_ALIGNED(8, PPContext, c)= *c2; //copy to stack for faster access
     int x,y;
@@ -999,7 +999,7 @@ static void postProcess_C(const uint8_t src[], int srcStride, uint8_t dst[], int
     }
 
     /* copy & deinterlace first row of blocks */
-    y=-BLOCK_SIZE;
+    y=-block_height;
     {
         const uint8_t *srcBlock= &(src[y*srcStride]);
         uint8_t *dstBlock= tempDst + dstStride;
@@ -1007,7 +1007,7 @@ static void postProcess_C(const uint8_t src[], int srcStride, uint8_t dst[], int
         // From this point on it is guaranteed that we can read and write 16 lines downward
         // finish 1 block before the next otherwise we might have a problem
         // with the L1 Cache of the P4 ... or only a few blocks at a time or something
-        for(x=0; x<width; x+=BLOCK_SIZE){
+        for(x=0; x<width; x+=block_width){
 
 
             blockCopy_C(dstBlock + dstStride*8, dstStride,
@@ -1043,7 +1043,7 @@ static void postProcess_C(const uint8_t src[], int srcStride, uint8_t dst[], int
         }
     }
 
-    for(y=0; y<height; y+=BLOCK_SIZE){
+    for(y=0; y<height; y+=block_height){
         //1% speedup if these are here instead of the inner loop
         const uint8_t *srcBlock= &(src[y*srcStride]);
         uint8_t *dstBlock= &(dst[y*dstStride]);
@@ -1077,7 +1077,7 @@ static void postProcess_C(const uint8_t src[], int srcStride, uint8_t dst[], int
         // From this point on it is guaranteed that we can read and write 16 lines downward
         // finish 1 block before the next otherwise we might have a problem
         // with the L1 Cache of the P4 ... or only a few blocks at a time or something
-        for(x=0; x<width; x+=BLOCK_SIZE){
+        for(x=0; x<width; x+=block_width){
             const int stride= dstStride;
             if(isColor){
                 QP= QPptr[x>>qpHShift];
diff --git a/libpostproc/postprocess_internal.h b/libpostproc/postprocess_internal.h
index 1ebd974..5a7be1f 100644
--- a/libpostproc/postprocess_internal.h
+++ b/libpostproc/postprocess_internal.h
@@ -174,5 +174,20 @@ static inline void linecpy(void *dest, const void *src, int lines, int stride) {
         memcpy((uint8_t*)dest+(lines-1)*stride, (const uint8_t*)src+(lines-1)*stride, -lines*stride);
     }
 }
-
+/*
+   Currently blocks are always N bytes wide and 8 rows high, where N is
+   determined by the size of the SIMD registers being used
+*/
+static const int block_height = 8;
+#if ARCH_X86 && !CONFIG_RUNTIME_CPUDETECT
+#if HAVE_AVX2
+static const int block_width = 32;
+#elif HAVE_SSE2
+static const int block_width = 16;
+#else
+static const int block_width = 8;
+#endif
+#else
+static int block_width; //determined at runtime; NOTE(review): static in a header, so each including .c file gets its own zero-initialized copy -- confirm it is set before any x+=block_width loop runs
+#endif
 #endif /* POSTPROC_POSTPROCESS_INTERNAL_H */
diff --git a/libpostproc/postprocess_template.c b/libpostproc/postprocess_template.c
index 25f60ab..cafc9c6 100644
--- a/libpostproc/postprocess_template.c
+++ b/libpostproc/postprocess_template.c
@@ -364,7 +364,7 @@ static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
     const int l9= stride + l8;
     int x;
     src+= stride*3;
-    for(x=0; x<BLOCK_SIZE; x++){
+    for(x=0; x<block_width; x++){
         const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
         const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
 
@@ -505,7 +505,7 @@ static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
     int x;
 
     src+= stride*3;
-    for(x=0; x<BLOCK_SIZE; x++){
+    for(x=0; x<block_width; x++){
         int a= src[l3] - src[l4];
         int b= src[l4] - src[l5];
         int c= src[l5] - src[l6];
@@ -1057,7 +1057,7 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext
 //    const int l9= stride + l8;
     int x;
     src+= stride*3;
-    for(x=0; x<BLOCK_SIZE; x++){
+    for(x=0; x<block_width; x++){
         const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
         if(FFABS(middleEnergy) < 8*c->QP){
             const int q=(src[l4] - src[l5])/2;
@@ -3175,7 +3175,7 @@ SCALED_CPY((%%REGa, %4), (%%REGa, %4, 2), (%%REGd, %5), (%%REGd, %5, 2))
 #else //TEMPLATE_PP_MMX && HAVE_6REGS
     for(i=0; i<8; i++)
         memcpy( &(dst[dstStride*i]),
-                &(src[srcStride*i]), BLOCK_SIZE);
+                &(src[srcStride*i]), block_width);
 #endif //TEMPLATE_PP_MMX && HAVE_6REGS
     }else{
 #if TEMPLATE_PP_MMX && HAVE_6REGS
@@ -3208,7 +3208,7 @@ SIMPLE_CPY((%%REGa, %2), (%%REGa, %2, 2), (%%REGd, %3), (%%REGd, %3, 2))
 #else //TEMPLATE_PP_MMX && HAVE_6REGS
     for(i=0; i<8; i++)
         memcpy( &(dst[dstStride*i]),
-                &(src[srcStride*i]), BLOCK_SIZE);
+                &(src[srcStride*i]), block_width);
 #endif //TEMPLATE_PP_MMX && HAVE_6REGS
     }
 }
@@ -3357,7 +3357,7 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
     }
 
     /* copy & deinterlace first row of blocks */
-    y=-BLOCK_SIZE;
+    y=-block_height;
     {
         const uint8_t *srcBlock= &(src[y*srcStride]);
         uint8_t *dstBlock= tempDst + dstStride;
@@ -3365,7 +3365,7 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
         // From this point on it is guaranteed that we can read and write 16 lines downward
         // finish 1 block before the next otherwise we might have a problem
         // with the L1 Cache of the P4 ... or only a few blocks at a time or something
-        for(x=0; x<width; x+=BLOCK_SIZE){
+        for(x=0; x<width; x+=block_width){
 
 #if TEMPLATE_PP_MMXEXT && HAVE_6REGS
 /*
@@ -3436,7 +3436,7 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
         }
     }
 
-    for(y=0; y<height; y+=BLOCK_SIZE){
+    for(y=0; y<height; y+=block_height){
         //1% speedup if these are here instead of the inner loop
         const uint8_t *srcBlock= &(src[y*srcStride]);
         uint8_t *dstBlock= &(dst[y*dstStride]);
@@ -3474,7 +3474,7 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
         // From this point on it is guaranteed that we can read and write 16 lines downward
         // finish 1 block before the next otherwise we might have a problem
         // with the L1 Cache of the P4 ... or only a few blocks at a time or something
-        for(x=0; x<width; x+=BLOCK_SIZE){
+        for(x=0; x<width; x+=block_width){
             const int stride= dstStride;
 #if TEMPLATE_PP_MMX
             uint8_t *tmpXchg;
-- 
2.2.1