[FFmpeg-devel] [PATCH] Use larger tables for yuv > 8 bit to RGB conversion.

Sat Nov 9 16:20:02 CET 2013

This should allow for fairly precise conversion of high bit depth input,
for example YUV16 to RGB48.
However, I believe that this specific implementation is not as accurate
as it could/should be, i.e. it is buggy.
Also, the context gets fairly large this way (each of the four lookup
tables grows from 512 to 131072 entries), so I am not sure whether
this is better than the previous shift-based approach.
---
 libswscale/swscale_internal.h |   9 +-
 libswscale/yuv2rgb.c          | 202 ++++++++++++++++++++----------------------
 libswscale/yuv2rgb_template.c |  30 +++----
 3 files changed, 117 insertions(+), 124 deletions(-)

diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 6ad278e..5ae1e93 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -38,6 +38,7 @@
 #define STR(s) AV_TOSTRING(s) // AV_STRINGIFY is too long
 
 #define YUVRGB_TABLE_HEADROOM 128
+#define YUVRGB_TABLE_HEADROOM16 32768
 
 #define MAX_FILTER_SIZE 256
 
@@ -364,10 +365,10 @@ typedef struct SwsContext {
     void *yuvTable;             // pointer to the yuv->rgb table start so it can be freed()
     // alignment ensures the offset can be added in a single
     // instruction on e.g. ARM
-    DECLARE_ALIGNED(16, int, table_gV)[256 + 2*YUVRGB_TABLE_HEADROOM];
-    uint8_t *table_rV[256 + 2*YUVRGB_TABLE_HEADROOM];
-    uint8_t *table_gU[256 + 2*YUVRGB_TABLE_HEADROOM];
-    uint8_t *table_bU[256 + 2*YUVRGB_TABLE_HEADROOM];
+    DECLARE_ALIGNED(16, int, table_gV)[256*256 + 2*YUVRGB_TABLE_HEADROOM16];
+    uint8_t *table_rV[256*256 + 2*YUVRGB_TABLE_HEADROOM16];
+    uint8_t *table_gU[256*256 + 2*YUVRGB_TABLE_HEADROOM16];
+    uint8_t *table_bU[256*256 + 2*YUVRGB_TABLE_HEADROOM16];
     DECLARE_ALIGNED(16, int32_t, input_rgb2yuv_table)[16+40*4]; // This table can contain both C and SIMD formatted values, teh C vales are always at the XY_IDX points
 #define RY_IDX 0
 #define GY_IDX 1
diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c
index 28de37e..dac64fb 100644
--- a/libswscale/yuv2rgb.c
+++ b/libswscale/yuv2rgb.c
@@ -54,60 +54,60 @@ const int *sws_getCoefficients(int colorspace)
 }
 
 #define LOADCHROMA(i)                               \
-    U = pu[i] >> shift;                             \
-    V = pv[i] >> shift;                             \
-    r = (void *)c->table_rV[V+YUVRGB_TABLE_HEADROOM];                     \
-    g = (void *)(c->table_gU[U+YUVRGB_TABLE_HEADROOM] + c->table_gV[V+YUVRGB_TABLE_HEADROOM]);  \
-    b = (void *)c->table_bU[U+YUVRGB_TABLE_HEADROOM];
+    U = pu[i];                                      \
+    V = pv[i];                                      \
+    r = (void *)c->table_rV[V+suffix(YUVRGB_TABLE_HEADROOM)]; \
+    g = (void *)(c->table_gU[U+suffix(YUVRGB_TABLE_HEADROOM)] + c->table_gV[V+suffix(YUVRGB_TABLE_HEADROOM)]); \
+    b = (void *)c->table_bU[U+suffix(YUVRGB_TABLE_HEADROOM)];
 
 #define PUTRGB(dst, src, i)                         \
-    Y              = src[2 * i] >> shift;           \
+    Y              = src[2 * i];                    \
     dst[2 * i]     = r[Y] + g[Y] + b[Y];            \
-    Y              = src[2 * i + 1] >> shift;       \
+    Y              = src[2 * i + 1];                \
     dst[2 * i + 1] = r[Y] + g[Y] + b[Y];
 
 #define PUTRGB24(dst, src, i)                       \
-    Y              = src[2 * i] >> shift;           \
+    Y              = src[2 * i];                    \
     dst[6 * i + 0] = r[Y];                          \
     dst[6 * i + 1] = g[Y];                          \
     dst[6 * i + 2] = b[Y];                          \
-    Y              = src[2 * i + 1] >> shift;       \
+    Y              = src[2 * i + 1];                \
     dst[6 * i + 3] = r[Y];                          \
     dst[6 * i + 4] = g[Y];                          \
     dst[6 * i + 5] = b[Y];
 
 #define PUTBGR24(dst, src, i)                       \
-    Y              = src[2 * i] >> shift;           \
+    Y              = src[2 * i];                    \
     dst[6 * i + 0] = b[Y];                          \
     dst[6 * i + 1] = g[Y];                          \
     dst[6 * i + 2] = r[Y];                          \
-    Y              = src[2 * i + 1] >> shift;       \
+    Y              = src[2 * i + 1];                \
     dst[6 * i + 3] = b[Y];                          \
     dst[6 * i + 4] = g[Y];                          \
     dst[6 * i + 5] = r[Y];
 
 #define PUTRGBA(dst, ysrc, asrc, i, s)                                  \
-    Y              = ysrc[2 * i] >> shift;                              \
-    dst[2 * i]     = r[Y] + g[Y] + b[Y] + (asrc[2 * i]     >> shift << s); \
-    Y              = ysrc[2 * i + 1] >> shift;                          \
-    dst[2 * i + 1] = r[Y] + g[Y] + b[Y] + (asrc[2 * i + 1] >> shift << s);
+    Y              = ysrc[2 * i];                                       \
+    dst[2 * i]     = r[Y] + g[Y] + b[Y] + (asrc[2 * i]     << s);       \
+    Y              = ysrc[2 * i + 1];                                   \
+    dst[2 * i + 1] = r[Y] + g[Y] + b[Y] + (asrc[2 * i + 1] << s);
 
 #define PUTRGB48(dst, src, i)                       \
-    Y                = src[ 2 * i] >> shift;        \
+    Y                = src[ 2 * i];                 \
     dst[12 * i +  0] = dst[12 * i +  1] = r[Y];     \
     dst[12 * i +  2] = dst[12 * i +  3] = g[Y];     \
     dst[12 * i +  4] = dst[12 * i +  5] = b[Y];     \
-    Y                = src[ 2 * i + 1] >> shift;    \
+    Y                = src[ 2 * i + 1];             \
     dst[12 * i +  6] = dst[12 * i +  7] = r[Y];     \
     dst[12 * i +  8] = dst[12 * i +  9] = g[Y];     \
     dst[12 * i + 10] = dst[12 * i + 11] = b[Y];
 
 #define PUTBGR48(dst, src, i)                       \
-    Y                = src[2 * i] >> shift;         \
+    Y                = src[2 * i];                  \
     dst[12 * i +  0] = dst[12 * i +  1] = b[Y];     \
     dst[12 * i +  2] = dst[12 * i +  3] = g[Y];     \
     dst[12 * i +  4] = dst[12 * i +  5] = r[Y];     \
-    Y                = src[2  * i +  1] >> shift;   \
+    Y                = src[2  * i +  1];            \
     dst[12 * i +  6] = dst[12 * i +  7] = b[Y];     \
     dst[12 * i +  8] = dst[12 * i +  9] = g[Y];     \
     dst[12 * i + 10] = dst[12 * i + 11] = r[Y];
@@ -164,35 +164,15 @@ const int *sws_getCoefficients(int colorspace)
     ENDYUV2RGBFUNC()
 
 #define src_type const uint8_t
-#define shift 0
 #define suffix(a) a
 #include "yuv2rgb_template.c"
 #undef src_type
-#undef shift
 #undef suffix
 
 #define src_type const uint16_t
-#define shift 1
-#define suffix(a) a##9
-#include "yuv2rgb_template.c"
-#undef src_type
-#undef shift
-#undef suffix
-
-#define src_type const uint16_t
-#define shift 2
-#define suffix(a) a##10
-#include "yuv2rgb_template.c"
-#undef src_type
-#undef shift
-#undef suffix
-
-#define src_type const uint16_t
-#define shift 8
 #define suffix(a) a##16
 #include "yuv2rgb_template.c"
 #undef src_type
-#undef shift
 #undef suffix
 
 SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
@@ -216,8 +196,7 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
            "No accelerated colorspace conversion found from %s to %s.\n",
            av_get_pix_fmt_name(c->srcFormat), av_get_pix_fmt_name(c->dstFormat));
 
-#define SELECT(n) \
-    (bits == 16 ? n##16 : bits == 10 ? n##10 : bits == 9 ? n##9 : n)
+#define SELECT(n) (bits > 8 ? n##16 : n)
 
     switch (c->dstFormat) {
     case AV_PIX_FMT_BGR48BE:
@@ -261,27 +240,31 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
     return NULL;
 }
 
-static void fill_table(uint8_t* table[256 + 2*YUVRGB_TABLE_HEADROOM], const int elemsize,
+static void fill_table(uint8_t **table, int bits, const int elemsize,
                        const int64_t inc, void *y_tab)
 {
     int i;
     uint8_t *y_table = y_tab;
+    int headroom = bits > 8 ? YUVRGB_TABLE_HEADROOM16 : YUVRGB_TABLE_HEADROOM;
+    int count = 1 << bits;
 
-    y_table -= elemsize * (inc >> 9);
+    y_table -= elemsize * (inc >> (17 - bits));
 
-    for (i = 0; i < 256 + 2*YUVRGB_TABLE_HEADROOM; i++) {
-        int64_t cb = av_clip(i-YUVRGB_TABLE_HEADROOM, 0, 255)*inc;
+    for (i = 0; i < count + 2*headroom; i++) {
+        int64_t cb = av_clip(i-headroom, 0, count-1)*inc;
         table[i] = y_table + elemsize * (cb >> 16);
     }
 }
 
-static void fill_gv_table(int table[256 + 2*YUVRGB_TABLE_HEADROOM], const int elemsize, const int64_t inc)
+static void fill_gv_table(int *table, int bits, const int elemsize, const int64_t inc)
 {
     int i;
-    int off    = -(inc >> 9);
+    int off    = -(inc >> (17 - bits));
+    int headroom = bits > 8 ? YUVRGB_TABLE_HEADROOM16 : YUVRGB_TABLE_HEADROOM;
+    int count = 1 << bits;
 
-    for (i = 0; i < 256 + 2*YUVRGB_TABLE_HEADROOM; i++) {
-        int64_t cb = av_clip(i-YUVRGB_TABLE_HEADROOM, 0, 255)*inc;
+    for (i = 0; i < count + 2*headroom; i++) {
+        int64_t cb = av_clip(i-headroom, 0, count-1)*inc;
         table[i] = elemsize * (off + (cb >> 16));
     }
 }
@@ -302,6 +285,8 @@ av_cold int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4],
                                      int fullRange, int brightness,
                                      int contrast, int saturation)
 {
+    int bits = av_pix_fmt_desc_get(c->srcFormat)->comp[0].depth_minus1 + 1;
+    int table_scale = (1 << bits) >> 8;
     const int isRgb = c->dstFormat == AV_PIX_FMT_RGB32     ||
                       c->dstFormat == AV_PIX_FMT_RGB32_1   ||
                       c->dstFormat == AV_PIX_FMT_BGR24     ||
@@ -326,7 +311,7 @@ av_cold int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4],
     uint16_t *y_table16;
     uint32_t *y_table32;
     int i, base, rbase, gbase, bbase, av_uninit(abase), needAlpha;
-    const int yoffs = fullRange ? 384 : 326;
+    int yoffs = fullRange ? 384 : 326;
 
     int64_t crv =  inv_table[0];
     int64_t cbu =  inv_table[1];
@@ -375,117 +360,124 @@ av_cold int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4],
     cgu = ((cgu << 16) + 0x8000) / FFMAX(cy, 1);
     cgv = ((cgv << 16) + 0x8000) / FFMAX(cy, 1);
 
+    yoffs *= table_scale;
+    cy    /= table_scale;
+    crv   /= table_scale;
+    cbu   /= table_scale;
+    cgu   /= table_scale;
+    cgv   /= table_scale;
+
     av_freep(&c->yuvTable);
 
     switch (bpp) {
     case 1:
-        c->yuvTable = av_malloc(1024);
+        c->yuvTable = av_malloc(table_scale * 1024);
         y_table     = c->yuvTable;
         yb = -(384 << 16) - oy;
-        for (i = 0; i < 1024 - 110; i++) {
-            y_table[i + 110]  = av_clip_uint8((yb + 0x8000) >> 16) >> 7;
+        for (i = 0; i < table_scale * (1024 - 110); i++) {
+            y_table[i + table_scale * 110]  = av_clip_uint8((yb + 0x8000) >> 16) >> 7;
             yb               += cy;
         }
-        fill_table(c->table_gU, 1, cgu, y_table + yoffs);
-        fill_gv_table(c->table_gV, 1, cgv);
+        fill_table(c->table_gU, bits, 1, cgu, y_table + yoffs);
+        fill_gv_table(c->table_gV, bits, 1, cgv);
         break;
     case 4:
     case 4 | 128:
         rbase       = isRgb ? 3 : 0;
         gbase       = 1;
         bbase       = isRgb ? 0 : 3;
-        c->yuvTable = av_malloc(1024 * 3);
+        c->yuvTable = av_malloc(table_scale * 1024 * 3);
         y_table     = c->yuvTable;
         yb = -(384 << 16) - oy;
-        for (i = 0; i < 1024 - 110; i++) {
+        for (i = 0; i < table_scale * (1024 - 110); i++) {
             int yval                = av_clip_uint8((yb + 0x8000) >> 16);
-            y_table[i + 110]        = (yval >> 7)        << rbase;
-            y_table[i +  37 + 1024] = ((yval + 43) / 85) << gbase;
-            y_table[i + 110 + 2048] = (yval >> 7)        << bbase;
+            y_table[i + table_scale * 110]        = (yval >> 7)        << rbase;
+            y_table[i + table_scale *  37 + table_scale * 1024] = ((yval + 43) / 85) << gbase;
+            y_table[i + table_scale * 110 + table_scale * 2048] = (yval >> 7)        << bbase;
             yb += cy;
         }
-        fill_table(c->table_rV, 1, crv, y_table + yoffs);
-        fill_table(c->table_gU, 1, cgu, y_table + yoffs + 1024);
-        fill_table(c->table_bU, 1, cbu, y_table + yoffs + 2048);
-        fill_gv_table(c->table_gV, 1, cgv);
+        fill_table(c->table_rV, bits, 1, crv, y_table + yoffs);
+        fill_table(c->table_gU, bits, 1, cgu, y_table + yoffs + table_scale * 1024);
+        fill_table(c->table_bU, bits, 1, cbu, y_table + yoffs + table_scale * 2048);
+        fill_gv_table(c->table_gV, bits, 1, cgv);
         break;
     case 8:
         rbase       = isRgb ? 5 : 0;
         gbase       = isRgb ? 2 : 3;
         bbase       = isRgb ? 0 : 6;
-        c->yuvTable = av_malloc(1024 * 3);
+        c->yuvTable = av_malloc(table_scale * 1024 * 3);
         y_table     = c->yuvTable;
         yb = -(384 << 16) - oy;
-        for (i = 0; i < 1024 - 38; i++) {
+        for (i = 0; i < table_scale * 1024 - table_scale * 38; i++) {
             int yval               = av_clip_uint8((yb + 0x8000) >> 16);
-            y_table[i + 16]        = ((yval + 18) / 36) << rbase;
-            y_table[i + 16 + 1024] = ((yval + 18) / 36) << gbase;
-            y_table[i + 37 + 2048] = ((yval + 43) / 85) << bbase;
+            y_table[i + table_scale * 16]        = ((yval + 18) / 36) << rbase;
+            y_table[i + table_scale * 16 + table_scale * 1024] = ((yval + 18) / 36) << gbase;
+            y_table[i + table_scale * 37 + table_scale * 2048] = ((yval + 43) / 85) << bbase;
             yb += cy;
         }
-        fill_table(c->table_rV, 1, crv, y_table + yoffs);
-        fill_table(c->table_gU, 1, cgu, y_table + yoffs + 1024);
-        fill_table(c->table_bU, 1, cbu, y_table + yoffs + 2048);
-        fill_gv_table(c->table_gV, 1, cgv);
+        fill_table(c->table_rV, bits, 1, crv, y_table + yoffs);
+        fill_table(c->table_gU, bits, 1, cgu, y_table + yoffs + table_scale * 1024);
+        fill_table(c->table_bU, bits, 1, cbu, y_table + yoffs + table_scale * 2048);
+        fill_gv_table(c->table_gV, bits, 1, cgv);
         break;
     case 12:
         rbase       = isRgb ? 8 : 0;
         gbase       = 4;
         bbase       = isRgb ? 0 : 8;
-        c->yuvTable = av_malloc(1024 * 3 * 2);
+        c->yuvTable = av_malloc(table_scale * 1024 * 3 * 2);
         y_table16   = c->yuvTable;
         yb = -(384 << 16) - oy;
-        for (i = 0; i < 1024; i++) {
+        for (i = 0; i < table_scale * 1024; i++) {
             uint8_t yval        = av_clip_uint8((yb + 0x8000) >> 16);
             y_table16[i]        = (yval >> 4) << rbase;
-            y_table16[i + 1024] = (yval >> 4) << gbase;
-            y_table16[i + 2048] = (yval >> 4) << bbase;
+            y_table16[i + table_scale * 1024] = (yval >> 4) << gbase;
+            y_table16[i + table_scale * 2048] = (yval >> 4) << bbase;
             yb += cy;
         }
         if (isNotNe)
-            for (i = 0; i < 1024 * 3; i++)
+            for (i = 0; i < table_scale * 1024 * 3; i++)
                 y_table16[i] = av_bswap16(y_table16[i]);
-        fill_table(c->table_rV, 2, crv, y_table16 + yoffs);
-        fill_table(c->table_gU, 2, cgu, y_table16 + yoffs + 1024);
-        fill_table(c->table_bU, 2, cbu, y_table16 + yoffs + 2048);
-        fill_gv_table(c->table_gV, 2, cgv);
+        fill_table(c->table_rV, bits, 2, crv, y_table16 + yoffs);
+        fill_table(c->table_gU, bits, 2, cgu, y_table16 + yoffs + table_scale * 1024);
+        fill_table(c->table_bU, bits, 2, cbu, y_table16 + yoffs + table_scale * 2048);
+        fill_gv_table(c->table_gV, bits, 2, cgv);
         break;
     case 15:
     case 16:
         rbase       = isRgb ? bpp - 5 : 0;
         gbase       = 5;
         bbase       = isRgb ? 0 : (bpp - 5);
-        c->yuvTable = av_malloc(1024 * 3 * 2);
+        c->yuvTable = av_malloc(table_scale * 1024 * 3 * 2);
         y_table16   = c->yuvTable;
         yb = -(384 << 16) - oy;
-        for (i = 0; i < 1024; i++) {
+        for (i = 0; i < table_scale * 1024; i++) {
             uint8_t yval        = av_clip_uint8((yb + 0x8000) >> 16);
             y_table16[i]        = (yval >> 3)          << rbase;
-            y_table16[i + 1024] = (yval >> (18 - bpp)) << gbase;
-            y_table16[i + 2048] = (yval >> 3)          << bbase;
+            y_table16[i + table_scale * 1024] = (yval >> (18 - bpp)) << gbase;
+            y_table16[i + table_scale * 2048] = (yval >> 3)          << bbase;
             yb += cy;
         }
         if (isNotNe)
-            for (i = 0; i < 1024 * 3; i++)
+            for (i = 0; i < table_scale * 1024 * 3; i++)
                 y_table16[i] = av_bswap16(y_table16[i]);
-        fill_table(c->table_rV, 2, crv, y_table16 + yoffs);
-        fill_table(c->table_gU, 2, cgu, y_table16 + yoffs + 1024);
-        fill_table(c->table_bU, 2, cbu, y_table16 + yoffs + 2048);
-        fill_gv_table(c->table_gV, 2, cgv);
+        fill_table(c->table_rV, bits, 2, crv, y_table16 + yoffs);
+        fill_table(c->table_gU, bits, 2, cgu, y_table16 + yoffs + table_scale * 1024);
+        fill_table(c->table_bU, bits, 2, cbu, y_table16 + yoffs + table_scale * 2048);
+        fill_gv_table(c->table_gV, bits, 2, cgv);
         break;
     case 24:
     case 48:
-        c->yuvTable = av_malloc(1024);
+        c->yuvTable = av_malloc(table_scale * 1024);
         y_table     = c->yuvTable;
         yb = -(384 << 16) - oy;
-        for (i = 0; i < 1024; i++) {
+        for (i = 0; i < table_scale * 1024; i++) {
             y_table[i]  = av_clip_uint8((yb + 0x8000) >> 16);
             yb         += cy;
         }
-        fill_table(c->table_rV, 1, crv, y_table + yoffs);
-        fill_table(c->table_gU, 1, cgu, y_table + yoffs);
-        fill_table(c->table_bU, 1, cbu, y_table + yoffs);
-        fill_gv_table(c->table_gV, 1, cgv);
+        fill_table(c->table_rV, bits, 1, crv, y_table + yoffs);
+        fill_table(c->table_gU, bits, 1, cgu, y_table + yoffs);
+        fill_table(c->table_bU, bits, 1, cbu, y_table + yoffs);
+        fill_gv_table(c->table_gV, bits, 1, cgv);
         break;
     case 32:
     case 64:
@@ -497,21 +489,21 @@ av_cold int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4],
         needAlpha = CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat);
         if (!needAlpha)
             abase = (base + 24) & 31;
-        c->yuvTable = av_malloc(1024 * 3 * 4);
+        c->yuvTable = av_malloc(table_scale * 1024 * 3 * 4);
         y_table32   = c->yuvTable;
         yb = -(384 << 16) - oy;
-        for (i = 0; i < 1024; i++) {
+        for (i = 0; i < table_scale * 1024; i++) {
             unsigned yval       = av_clip_uint8((yb + 0x8000) >> 16);
             y_table32[i]        = (yval << rbase) +
                                   (needAlpha ? 0 : (255u << abase));
-            y_table32[i + 1024] =  yval << gbase;
-            y_table32[i + 2048] =  yval << bbase;
+            y_table32[i + table_scale * 1024] =  yval << gbase;
+            y_table32[i + table_scale * 2048] =  yval << bbase;
             yb += cy;
         }
-        fill_table(c->table_rV, 4, crv, y_table32 + yoffs);
-        fill_table(c->table_gU, 4, cgu, y_table32 + yoffs + 1024);
-        fill_table(c->table_bU, 4, cbu, y_table32 + yoffs + 2048);
-        fill_gv_table(c->table_gV, 4, cgv);
+        fill_table(c->table_rV, bits, 4, crv, y_table32 + yoffs);
+        fill_table(c->table_gU, bits, 4, cgu, y_table32 + yoffs + table_scale * 1024);
+        fill_table(c->table_bU, bits, 4, cbu, y_table32 + yoffs + table_scale * 2048);
+        fill_gv_table(c->table_gV, bits, 4, cgv);
         break;
     default:
         if(!isPlanar(c->dstFormat) || bpp <= 24)
diff --git a/libswscale/yuv2rgb_template.c b/libswscale/yuv2rgb_template.c
index e3ca8ba..0034734 100644
--- a/libswscale/yuv2rgb_template.c
+++ b/libswscale/yuv2rgb_template.c
@@ -248,11 +248,11 @@ YUV2RGBFUNC(yuv2rgb_c_16_ordered_dither, uint16_t, 0)
     const uint8_t *f16 = ff_dither_2x2_8[(y & 1)^1];
 
 #define PUTRGB16(dst, src, i, o)                    \
-    Y              = src[2 * i] >> shift;           \
+    Y              = src[2 * i];                    \
     dst[2 * i]     = r[Y + d16[0 + o]] +            \
                      g[Y + e16[0 + o]] +            \
                      b[Y + f16[0 + o]];             \
-    Y              = src[2 * i + 1] >> shift;       \
+    Y              = src[2 * i + 1];                \
     dst[2 * i + 1] = r[Y + d16[1 + o]] +            \
                      g[Y + e16[1 + o]] +            \
                      b[Y + f16[1 + o]];
@@ -278,11 +278,11 @@ YUV2RGBFUNC(yuv2rgb_c_15_ordered_dither, uint16_t, 0)
     const uint8_t *e16 = ff_dither_2x2_8[(y & 1)^1];
 
 #define PUTRGB15(dst, src, i, o)                    \
-    Y              = src[2 * i] >> shift;           \
+    Y              = src[2 * i];                    \
     dst[2 * i]     = r[Y + d16[0 + o]] +            \
                      g[Y + d16[1 + o]] +            \
                      b[Y + e16[0 + o]];             \
-    Y              = src[2 * i + 1] >> shift;       \
+    Y              = src[2 * i + 1];                \
     dst[2 * i + 1] = r[Y + d16[1 + o]] +            \
                      g[Y + d16[0 + o]] +            \
                      b[Y + e16[1 + o]];
@@ -308,11 +308,11 @@ YUV2RGBFUNC(yuv2rgb_c_12_ordered_dither, uint16_t, 0)
     const uint8_t *d16 = ff_dither_4x4_16[y & 3];
 
 #define PUTRGB12(dst, src, i, o)                    \
-    Y              = src[2 * i] >> shift;           \
+    Y              = src[2 * i];                    \
     dst[2 * i]     = r[Y + d16[0 + o]] +            \
                      g[Y + d16[0 + o]] +            \
                      b[Y + d16[0 + o]];             \
-    Y              = src[2 * i + 1] >> shift;       \
+    Y              = src[2 * i + 1];                \
     dst[2 * i + 1] = r[Y + d16[1 + o]] +            \
                      g[Y + d16[1 + o]] +            \
                      b[Y + d16[1 + o]];
@@ -340,11 +340,11 @@ YUV2RGBFUNC(yuv2rgb_c_8_ordered_dither, uint8_t, 0)
     const uint8_t *d64 = ff_dither_8x8_73[y & 7];
 
 #define PUTRGB8(dst, src, i, o)                     \
-    Y              = src[2 * i] >> shift;           \
+    Y              = src[2 * i];                    \
     dst[2 * i]     = r[Y + d32[0 + o]] +            \
                      g[Y + d32[0 + o]] +            \
                      b[Y + d64[0 + o]];             \
-    Y              = src[2 * i + 1] >> shift;       \
+    Y              = src[2 * i + 1];                \
     dst[2 * i + 1] = r[Y + d32[1 + o]] +            \
                      g[Y + d32[1 + o]] +            \
                      b[Y + d64[1 + o]];
@@ -372,11 +372,11 @@ YUV2RGBFUNC(yuv2rgb_c_4_ordered_dither, uint8_t, 0)
     int acc;
 
 #define PUTRGB4D(dst, src, i, o)                    \
-    Y      = src[2 * i] >> shift;                   \
+    Y      = src[2 * i];                            \
     acc    = r[Y + d128[0 + o]] +                   \
              g[Y +  d64[0 + o]] +                   \
              b[Y + d128[0 + o]];                    \
-    Y      = src[2 * i + 1] >> shift;               \
+    Y      = src[2 * i + 1];                        \
     acc   |= (r[Y + d128[1 + o]] +                  \
               g[Y +  d64[1 + o]] +                  \
               b[Y + d128[1 + o]]) << 4;             \
@@ -404,11 +404,11 @@ YUV2RGBFUNC(yuv2rgb_c_4b_ordered_dither, uint8_t, 0)
     const uint8_t *d128 = ff_dither_8x8_220[y & 7];
 
 #define PUTRGB4DB(dst, src, i, o)                   \
-    Y              = src[2 * i] >> shift;           \
+    Y              = src[2 * i];                    \
     dst[2 * i]     = r[Y + d128[0 + o]] +           \
                      g[Y +  d64[0 + o]] +           \
                      b[Y + d128[0 + o]];            \
-    Y              = src[2 * i + 1] >> shift;       \
+    Y              = src[2 * i + 1];                \
     dst[2 * i + 1] = r[Y + d128[1 + o]] +           \
                      g[Y +  d64[1 + o]] +           \
                      b[Y + d128[1 + o]];
@@ -433,12 +433,12 @@ CLOSEYUV2RGBFUNC(8)
 YUV2RGBFUNC(yuv2rgb_c_1_ordered_dither, uint8_t, 0)
     const uint8_t *d128 = ff_dither_8x8_220[y & 7];
     char out_1 = 0, out_2 = 0;
-    g = c->table_gU[128 + YUVRGB_TABLE_HEADROOM] + c->table_gV[128 + YUVRGB_TABLE_HEADROOM];
+    g = c->table_gU[128 + suffix(YUVRGB_TABLE_HEADROOM)] + c->table_gV[128 + suffix(YUVRGB_TABLE_HEADROOM)];
 
 #define PUTRGB1(out, src, i, o)                     \
-    Y    = src[2 * i] >> shift;                     \
+    Y    = src[2 * i];                              \
     out += out + g[Y + d128[0 + o]];                \
-    Y    = src[2 * i + 1] >> shift;                 \
+    Y    = src[2 * i + 1];                          \
     out += out + g[Y + d128[1 + o]];
 
     PUTRGB1(out_1, py_1, 0, 0);
-- 
1.8.4.2