[FFmpeg-devel] patch: the fastest video decoder ever? :)

u-9iep at aetey.se u-9iep at aetey.se
Fri Jan 6 15:13:41 EET 2017


Hello,

On slow hardware a 16-bit framebuffer depth makes a remarkable difference
in performance and still provides a good look.

When videos are to be played on slow terminals there is a single
best speed performer, cinepak (tell me if you know a better codec in this
respect, the only comparable one I know of is VQA-15, but there is no
reasonable encoder for it, nor a safe decoder).

Unfortunately cinepak as present in ffmpeg yields rgb24 which has to be
repacked depending on the framebuffer format.

The attached patch makes cinepak to provide rgb565 in native endianness.
This cuts in half (!) the mplayer CPU consumption on i686 with Xorg
in the corresponding mode/depth, compared to decoding to rgb24.

The optimization is possible because cinepak is a VC codec and pixel
format packing at codebook fill is much more efficient than at a later
stage. (Thanks Michael for the suggestion, long ago).

Of course the picture quality is slightly affected if this decoder version
is used with larger framebuffer depths. A run-time switch would be to
prefer, if this optimization against all odds would be welcome upstream :)

I am posting this as a proof of concept, lacking run-time format
switching. As said, this patch yields half CPU consumption or double
speed at decoding for rgb565 terminals.

Regards,
Rune
-------------- next part --------------
--- libavcodec/cinepak.c.rgb24	2017-01-05 14:43:53.379965430 +0100
+++ libavcodec/cinepak.c	2017-01-05 21:54:04.333090225 +0100
@@ -30,7 +30,9 @@
  *   http://wiki.multimedia.cx/index.php?title=Sega_FILM
  *
  * Cinepak colorspace support (c) 2013 Rl, Aetey Global Technologies AB
+ * Cinepak rgb565 format (c) 2017 Rl, Aetey Global Technologies AB
  * @author Cinepak colorspace, Rl, Aetey Global Technologies AB
+ * @author Cinepak rgb565, Rl, Aetey Global Technologies AB
  */
 
 #include <stdio.h>
@@ -42,8 +44,12 @@
 #include "avcodec.h"
 #include "internal.h"
 
+/* feel free to replace with a better mapping implementation
+ * (keeping in mind slow, not very "intelligent" hardware)
+ */
+#define PACK_RGB_RGB565(r,g,b) ((((r)>>3)<<11)|(((g)>>2)<<5)|(b>>3))
 
-typedef uint8_t cvid_codebook[12];
+typedef uint16_t cvid_codebook[4]; /* in the native endian rgb565 format */
 
 #define MAX_STRIPS      32
 
@@ -73,13 +79,13 @@
     uint32_t pal[256];
 } CinepakContext;
 
-static void cinepak_decode_codebook (cvid_codebook *codebook,
+static void cinepak_decode_codebook (cvid_codebook *codebook, int palette_video,
                                      int chunk_id, int size, const uint8_t *data)
 {
     const uint8_t *eod = (data + size);
     uint32_t flag, mask;
     int      i, n;
-    uint8_t *p;
+    uint16_t *p;
 
     /* check if this chunk contains 4- or 6-element vectors */
     n    = (chunk_id & 0x04) ? 4 : 6;
@@ -98,33 +104,36 @@
         }
 
         if (!(chunk_id & 0x01) || (flag & mask)) {
-            int k, kk;
+            int k;
 
             if ((data + n) > eod)
                 break;
 
-            for (k = 0; k < 4; ++k) {
-                int r = *data++;
-                for (kk = 0; kk < 3; ++kk)
-                    *p++ = r;
-            }
-            if (n == 6) {
-                int r, g, b, u, v;
+            if (n == 4)
+/* 8-bit palette indexes or otherwise grayscale values,
+ * they need different handling */
+                if (palette_video)
+                    for (k = 0; k < 4; ++k)
+                        *p++ = *data++;
+                else
+                    for (k = 0; k < 4; ++k) {
+                        int r = *data++;
+                        *p++ = PACK_RGB_RGB565(r,r,r);
+                    }
+            else {
+                int y[4], u, v;
+                for (k = 0; k < 4; ++k)
+/* 8-bit luminance values */
+                    y[k] = *data++;
                 u = *(int8_t *)data++;
                 v = *(int8_t *)data++;
-                p -= 12;
-                for(k=0; k<4; ++k) {
-                    r = *p++ + v*2;
-                    g = *p++ - (u/2) - v;
-                    b = *p   + u*2;
-                    p -= 2;
-                    *p++ = av_clip_uint8(r);
-                    *p++ = av_clip_uint8(g);
-                    *p++ = av_clip_uint8(b);
-                }
+                for(k=0; k<4; ++k)
+                    *p++ = PACK_RGB_RGB565(av_clip_uint8(y[k] + v*2),
+                                           av_clip_uint8(y[k] - (u/2) - v),
+                                           av_clip_uint8(y[k] + u*2));
             }
         } else {
-            p += 12;
+            p += 4;
         }
     }
 }
@@ -134,8 +143,8 @@
 {
     const uint8_t   *eod = (data + size);
     uint32_t         flag, mask;
-    uint8_t         *cb0, *cb1, *cb2, *cb3;
-    int             x, y;
+    uint16_t        *cb0, *cb1, *cb2, *cb3;
+    int              x, y;
     char            *ip0, *ip1, *ip2, *ip3;
 
     flag = 0;
@@ -145,7 +154,7 @@
 
 /* take care of y dimension not being multiple of 4, such streams exist */
         ip0 = ip1 = ip2 = ip3 = s->frame->data[0] +
-          (s->palette_video?strip->x1:strip->x1*3) + (y * s->frame->linesize[0]);
+          (s->palette_video?strip->x1:strip->x1*2) + (y * s->frame->linesize[0]);
         if(s->avctx->height - y > 1) {
             ip1 = ip0 + s->frame->linesize[0];
             if(s->avctx->height - y > 2) {
@@ -181,29 +190,25 @@
                 }
 
                 if ((chunk_id & 0x02) || (~flag & mask)) {
-                    uint8_t *p;
+                    uint16_t *p;
                     if (data >= eod)
                         return AVERROR_INVALIDDATA;
 
                     p = strip->v1_codebook[*data++];
                     if (s->palette_video) {
-                        ip3[0] = ip3[1] = ip2[0] = ip2[1] = p[6];
-                        ip3[2] = ip3[3] = ip2[2] = ip2[3] = p[9];
+                        ip3[0] = ip3[1] = ip2[0] = ip2[1] = p[2];
+                        ip3[2] = ip3[3] = ip2[2] = ip2[3] = p[3];
                         ip1[0] = ip1[1] = ip0[0] = ip0[1] = p[0];
-                        ip1[2] = ip1[3] = ip0[2] = ip0[3] = p[3];
+                        ip1[2] = ip1[3] = ip0[2] = ip0[3] = p[1];
                     } else {
-                        p += 6;
-                        memcpy(ip3 + 0, p, 3); memcpy(ip3 + 3, p, 3);
-                        memcpy(ip2 + 0, p, 3); memcpy(ip2 + 3, p, 3);
-                        p += 3; /* ... + 9 */
-                        memcpy(ip3 + 6, p, 3); memcpy(ip3 + 9, p, 3);
-                        memcpy(ip2 + 6, p, 3); memcpy(ip2 + 9, p, 3);
-                        p -= 9; /* ... + 0 */
-                        memcpy(ip1 + 0, p, 3); memcpy(ip1 + 3, p, 3);
-                        memcpy(ip0 + 0, p, 3); memcpy(ip0 + 3, p, 3);
-                        p += 3; /* ... + 3 */
-                        memcpy(ip1 + 6, p, 3); memcpy(ip1 + 9, p, 3);
-                        memcpy(ip0 + 6, p, 3); memcpy(ip0 + 9, p, 3);
+                        * (uint16_t *)ip3    = *((uint16_t *)ip3+1) =
+                        * (uint16_t *)ip2    = *((uint16_t *)ip2+1) = p[2];
+                        *((uint16_t *)ip3+2) = *((uint16_t *)ip3+3) =
+                        *((uint16_t *)ip2+2) = *((uint16_t *)ip2+3) = p[3];
+                        * (uint16_t *)ip1    = *((uint16_t *)ip1+1) =
+                        * (uint16_t *)ip0    = *((uint16_t *)ip0+1) = p[0];
+                        *((uint16_t *)ip1+2) = *((uint16_t *)ip1+3) =
+                        *((uint16_t *)ip0+2) = *((uint16_t *)ip0+3) = p[1];
                     }
 
                 } else if (flag & mask) {
@@ -217,34 +222,34 @@
                     if (s->palette_video) {
                         uint8_t *p;
                         p = ip3;
-                        *p++ = cb2[6];
-                        *p++ = cb2[9];
-                        *p++ = cb3[6];
-                        *p   = cb3[9];
+                        *p++ = cb2[2];
+                        *p++ = cb2[3];
+                        *p++ = cb3[2];
+                        *p   = cb3[3];
                         p = ip2;
                         *p++ = cb2[0];
-                        *p++ = cb2[3];
+                        *p++ = cb2[1];
                         *p++ = cb3[0];
-                        *p   = cb3[3];
+                        *p   = cb3[1];
                         p = ip1;
-                        *p++ = cb0[6];
-                        *p++ = cb0[9];
-                        *p++ = cb1[6];
-                        *p   = cb1[9];
+                        *p++ = cb0[2];
+                        *p++ = cb0[3];
+                        *p++ = cb1[2];
+                        *p   = cb1[3];
                         p = ip0;
                         *p++ = cb0[0];
-                        *p++ = cb0[3];
+                        *p++ = cb0[1];
                         *p++ = cb1[0];
-                        *p   = cb1[3];
+                        *p   = cb1[1];
                     } else {
-                        memcpy(ip3 + 0, cb2 + 6, 6);
-                        memcpy(ip3 + 6, cb3 + 6, 6);
-                        memcpy(ip2 + 0, cb2 + 0, 6);
-                        memcpy(ip2 + 6, cb3 + 0, 6);
-                        memcpy(ip1 + 0, cb0 + 6, 6);
-                        memcpy(ip1 + 6, cb1 + 6, 6);
-                        memcpy(ip0 + 0, cb0 + 0, 6);
-                        memcpy(ip0 + 6, cb1 + 0, 6);
+                        memcpy(ip3 + 0, cb2 + 2, 4);
+                        memcpy(ip3 + 4, cb3 + 2, 4);
+                        memcpy(ip2 + 0, cb2 + 0, 4);
+                        memcpy(ip2 + 4, cb3 + 0, 4);
+                        memcpy(ip1 + 0, cb0 + 2, 4);
+                        memcpy(ip1 + 4, cb1 + 2, 4);
+                        memcpy(ip0 + 0, cb0 + 0, 4);
+                        memcpy(ip0 + 4, cb1 + 0, 4);
                     }
 
                 }
@@ -254,8 +259,8 @@
                 ip0 += 4;  ip1 += 4;
                 ip2 += 4;  ip3 += 4;
             } else {
-                ip0 += 12;  ip1 += 12;
-                ip2 += 12;  ip3 += 12;
+                ip0 += 8;  ip1 += 8;
+                ip2 += 8;  ip3 += 8;
             }
         }
     }
@@ -290,16 +295,16 @@
         case 0x21:
         case 0x24:
         case 0x25:
-            cinepak_decode_codebook (strip->v4_codebook, chunk_id,
-                chunk_size, data);
+            cinepak_decode_codebook (strip->v4_codebook, s->palette_video,
+                chunk_id, chunk_size, data);
             break;
 
         case 0x22:
         case 0x23:
         case 0x26:
         case 0x27:
-            cinepak_decode_codebook (strip->v1_codebook, chunk_id,
-                chunk_size, data);
+            cinepak_decode_codebook (strip->v1_codebook, s->palette_video,
+                chunk_id, chunk_size, data);
             break;
 
         case 0x30:
@@ -412,10 +417,16 @@
 
     s->sega_film_skip_bytes = -1;  /* uninitialized state */
 
-    // check for paletted data
+    /* check for paletted data */
     if (avctx->bits_per_coded_sample != 8) {
         s->palette_video = 0;
-        avctx->pix_fmt = AV_PIX_FMT_RGB24;
+/* we choose to produce video data in a certain 16-bit format,
+ * for the sake of video output on slow hardware with this native format
+ * FIXME: this should be made selectable via an option, not hardcoded,
+ * FIXME: to make the same binary usable with different hardware,
+ * FIXME: for the moment we make this a compile-time choice by using
+ * FIXME: a certain version of the cinepak.c file */
+        avctx->pix_fmt = AV_PIX_FMT_RGB565;
     } else {
         s->palette_video = 1;
         avctx->pix_fmt = AV_PIX_FMT_PAL8;


More information about the ffmpeg-devel mailing list