[FFmpeg-devel] deduplicated [PATCH] Cinepak: speed up decoding several-fold, depending on the scenario, by supporting multiple output pixel formats.

Sat Feb 11 23:25:03 EET 2017

Hello,

This is my best-effort attempt to make the patch acceptable
by upstream's criteria.

Daniel, do you mind that I referred to your message in the commit?
I believe it is best to cite numbers from a third-party measurement.

The code seems to be equivalent to the previous patch,
with about 20% less LOC.

This hurts readability (my subjective impression) but on the positive side
the change makes the structure of the code more explicit.

Attaching the patch.

Now I have done what I can and have to leave.
Unless there are bugs in the patch, my attempt to contribute ends
at this point.

Thanks to everyone who cared to objectively discuss a specific case
of ffmpeg usage, the implications of techniques around VQ and whether/why
some non-traditional approaches can make sense.

Good luck to the ffmpeg project, it is very useful and valuable.

Best regards,
Rune
-------------- next part --------------
>From 0c9badec5d144b995c0bb52c7a80939b672be3f5 Mon Sep 17 00:00:00 2001
From: Rl <addr-see-the-website at aetey.se>
Date: Sat, 11 Feb 2017 20:28:54 +0100
Subject: [PATCH] Cinepak: speed up decoding several-fold, depending on the
 scenario, by supporting multiple output pixel formats.

Decoding to rgb24 and pal8 is optimized.

Added rgb32, rgb565, yuv420p, each with faster decoding than to rgb24.

The most noticeable gain comes from the new possibility
to skip format conversions, for example when decoding to rgb565:
----
Using matrixbench_mpeg2.mpg (720x567) encoded with ffmpeg into Cinepak
using default settings, decoding on an i5 3570K, 3.4 GHz:

bicubic (default):          ~24x realtime
fast_bilinear:              ~65x realtime
patch w/rgb565 override:    ~154x realtime
----
(https://ffmpeg.org/pipermail/ffmpeg-devel/2017-February/206799.html)

Palettized input can be decoded to any of the output formats;
pal8 output is still limited to palettized input.

With input other than palettized/grayscale,
yuv420p output is approximated by the Cinepak colorspace.

The output format can be chosen at runtime by an option or via the API.
---
 libavcodec/cinepak.c | 844 +++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 692 insertions(+), 152 deletions(-)

diff --git a/libavcodec/cinepak.c b/libavcodec/cinepak.c
index d657e9c0c1..7b08e20e06 100644
--- a/libavcodec/cinepak.c
+++ b/libavcodec/cinepak.c
@@ -31,6 +31,8 @@
  *
  * Cinepak colorspace support (c) 2013 Rl, Aetey Global Technologies AB
  * @author Cinepak colorspace, Rl, Aetey Global Technologies AB
+ * Extra output formats / optimizations (c) 2017 Rl, Aetey Global Technologies AB
+ * @author Extra output formats / optimizations, Rl, Aetey Global Technologies AB
  */
 
 #include <stdio.h>
@@ -39,23 +41,48 @@
 
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
+#include "libavutil/opt.h"
+/* #include "libavutil/avassert.h" */
 #include "avcodec.h"
 #include "internal.h"
 
+/* rounding to nearest; truncation would be slightly faster
+ * but it noticeably affects the picture quality;
+ * unless we become extremely desperate to use every single cycle
+ * we do not bother implementing a choice -- rl */
+#define PACK_RGB_RGB565(r,g,b) (((av_clip_uint8((r)+4)>>3)<<11)|((av_clip_uint8((g)+2)>>2)<<5)|(av_clip_uint8((b)+4)>>3))
 
-typedef uint8_t cvid_codebook[12];
+/*
+ * more "desperate/ultimate" optimization possibilities:
+ * - possibly (hardly?) spare a cycle or two by not ensuring to stay
+ *   inside the frame at vector decoding (the frame is allocated with
+ *   a margin for us as an extra precaution, we can as well use this)
+ * - skip filling in opacity when it is not needed by the data consumer,
+ *   in many cases rgb32 is almost as fast as rgb565, with full quality,
+ *   improving its speed can make sense
+ */
+
+typedef union cvid_codebook {
+    uint32_t  rgb32[256][ 4];
+    uint8_t   rgb24[256][12];
+    uint16_t rgb565[256][ 4];
+    uint8_t  yuv420[256][ 6];
+    uint8_t    pal8[256][ 4];
+} cvid_codebook;
 
-#define MAX_STRIPS      32
+#define MAX_STRIPS      32    /* an arbitrary limit -- rl */
 
 typedef struct cvid_strip {
     uint16_t          id;
     uint16_t          x1, y1;
     uint16_t          x2, y2;
-    cvid_codebook     v4_codebook[256];
-    cvid_codebook     v1_codebook[256];
+    cvid_codebook     v4_codebook;
+    cvid_codebook     v1_codebook;
 } cvid_strip;
 
-typedef struct CinepakContext {
+typedef struct CinepakContext CinepakContext;
+struct CinepakContext {
+    const AVClass *class;
 
     AVCodecContext *avctx;
     AVFrame *frame;
@@ -71,57 +98,192 @@ typedef struct CinepakContext {
     int sega_film_skip_bytes;
 
     uint32_t pal[256];
-} CinepakContext;
 
-static void cinepak_decode_codebook (cvid_codebook *codebook,
-                                     int chunk_id, int size, const uint8_t *data)
-{
-    const uint8_t *eod = (data + size);
-    uint32_t flag, mask;
-    int      i, n;
-    uint8_t *p;
-
-    /* check if this chunk contains 4- or 6-element vectors */
-    n    = (chunk_id & 0x04) ? 4 : 6;
-    flag = 0;
-    mask = 0;
-
-    p = codebook[0];
-    for (i=0; i < 256; i++) {
-        if ((chunk_id & 0x01) && !(mask >>= 1)) {
-            if ((data + 4) > eod)
-                break;
-
-            flag  = AV_RB32 (data);
-            data += 4;
-            mask  = 0x80000000;
-        }
+    void (*decode_codebook)(CinepakContext *s,
+                            cvid_codebook *codebook, int chunk_id,
+                            int size, const uint8_t *data);
+    int  (*decode_vectors)(CinepakContext *s, cvid_strip *strip,
+                           int chunk_id, int size, const uint8_t *data);
+/* options */
+    enum AVPixelFormat out_pixfmt;
+};
 
-        if (!(chunk_id & 0x01) || (flag & mask)) {
-            int k, kk;
+#define OFFSET(x) offsetof(CinepakContext, x)
+#define VD AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM
+static const AVOption options[] = {
+{"output_pixel_format", "set output pixel format: rgb24/rgb32/rgb565/yuv420p/pal8; yuv420p is approximate", OFFSET(out_pixfmt), AV_OPT_TYPE_PIXEL_FMT, {.i64=AV_PIX_FMT_NONE}, -1, INT_MAX, VD },
+    { NULL },
+};
 
-            if ((data + n) > eod)
-                break;
+static const AVClass cinepak_class = {
+    .class_name = "cinepak decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
 
-            for (k = 0; k < 4; ++k) {
-                int r = *data++;
-                for (kk = 0; kk < 3; ++kk)
-                    *p++ = r;
+/* this is an attempt to satisfy the requirement to reduce code duplication
+ * feel free to do this in a more elegant fashion, but keep the speed
+ * -- rl */
+#define CODEBOOK_HEAD \
+    const uint8_t *eod;\
+    uint32_t flag, mask;\
+    int      i, n;\
+    int palette_video;\
+    int selective_update;\
+
+#define CODEBOOK_STREAM_PARSING \
+    for (i=0; i < 256; i++) {\
+        if (selective_update && !(mask >>= 1)) {\
+            if ((data + 4) > eod)\
+                break;\
+\
+            flag  = AV_RB32 (data);\
+            data += 4;\
+            mask  = 0x80000000;\
+        }\
+\
+        if (!selective_update || (flag & mask)) {\
+            int k;\
+\
+            if ((data + n) > eod)\
+                break;\
+
+#define CODEBOOK_INTRO \
+    selective_update = (chunk_id & 0x01);\
+    eod = (data + size);\
+    flag = 0;\
+    mask = 0;\
+
+#define CODEBOOK_FULL_COLOR \
+    /* check if this chunk contains 4- or 6-element vectors */\
+    n    = (chunk_id & 0x04) ? 4 : 6;\
+    palette_video = s->palette_video;\
+    CODEBOOK_INTRO\
+    CODEBOOK_STREAM_PARSING\
+
+#define VECTOR_INTRO \
+    CODEBOOK_INTRO\
+    v1_only          = (chunk_id & 0x02);\
+\
+    for (y=strip->y1; y < strip->y2; y+=4) {\
+
+#define VECTOR_STREAM_PARSING \
+        for (x=strip->x1; x < strip->x2; x+=4) {\
+            if (selective_update && !(mask >>= 1)) {\
+                if ((data + 4) > eod)\
+                    return AVERROR_INVALIDDATA;\
+\
+                flag  = AV_RB32 (data);\
+                data += 4;\
+                mask  = 0x80000000;\
+            }\
+\
+            if (!selective_update || (flag & mask)) {\
+                if (!v1_only && !(mask >>= 1)) {\
+                    if ((data + 4) > eod)\
+                        return AVERROR_INVALIDDATA;\
+\
+                    flag  = AV_RB32 (data);\
+                    data += 4;\
+                    mask  = 0x80000000;\
+                }\
+\
+                if (v1_only || (~flag & mask)) {\
+                    POINTER_TYPE *p;\
+                    if (data >= eod)\
+                        return AVERROR_INVALIDDATA;\
+
+#define VECTOR_DO \
+/* take care of y dimension not being multiple of 4, such streams exist */\
+        if(s->avctx->height - y > 1) {\
+            ip1 = ip0 + s->frame->linesize[0];\
+            if(s->avctx->height - y > 2) {\
+                ip2 = ip1 + s->frame->linesize[0];\
+                if(s->avctx->height - y > 3) {\
+                    ip3 = ip2 + s->frame->linesize[0];\
+                }\
+            }\
+        }\
+/* to get the correct picture for not-multiple-of-4 cases let us fill each\
+ * block from the bottom up, thus possibly overwriting the bottommost line\
+ * more than once but ending with the correct data in place\
+ * (instead of in-loop checking) */\
+        VECTOR_STREAM_PARSING\
+
+static void cinepak_decode_codebook_rgb32 (CinepakContext *s,
+    cvid_codebook *codebook, int chunk_id, int size, const uint8_t *data)
+{
+    CODEBOOK_HEAD
+    uint32_t *p = codebook->rgb32[0];
+
+    CODEBOOK_FULL_COLOR
+
+            if (n == 4)
+                if (palette_video)
+                    for (k = 0; k < 4; ++k)
+                        *p++ = s->pal[*data++]; /* this is easy */
+                else
+                    for (k = 0; k < 4; ++k) {
+                        int r = *data++;
+/* in some situations we might not have to set opacity */
+                        *p++ = /**/ (255<<24)| /**/ (r<<16)|(r<<8)|r;
+                    }
+            else { /* n == 6 */
+                int y, u, v;
+                u = (int8_t)data[4];
+                v = (int8_t)data[5];
+                for(k=0; k<4; ++k) {
+                    y = *data++;
+/* in some situations we might not have to set opacity */
+                    *p++ = /**/ (255<<24)| /**/
+/* here the cinepak color space excels */
+                           (av_clip_uint8(y + v*2)<<16)|
+                           (av_clip_uint8(y - (u/2) - v)<<8)|
+                            av_clip_uint8(y + u*2);
+                }
+                data += 2;
             }
-            if (n == 6) {
-                int r, g, b, u, v;
-                u = *(int8_t *)data++;
-                v = *(int8_t *)data++;
-                p -= 12;
+        } else {
+            p += 4;
+        }
+    }
+}
+
+static void cinepak_decode_codebook_rgb24 (CinepakContext *s,
+    cvid_codebook *codebook, int chunk_id, int size, const uint8_t *data)
+{
+    CODEBOOK_HEAD
+    uint8_t *p = codebook->rgb24[0];
+
+    CODEBOOK_FULL_COLOR
+
+            if (n == 4)
+                if (palette_video)
+                    for (k = 0; k < 4; ++k) {
+                        uint32_t r = s->pal[*data++];
+                        *p++ = (r>>16)&0xff;
+                        *p++ = (r>>8) &0xff;
+                        *p++ =  r     &0xff;
+                    }
+                else
+                    for (k = 0; k < 4; ++k) {
+                        int kk, r = *data++;
+                        for (kk = 0; kk < 3; ++kk)
+                            *p++ = r;
+                    }
+            else { /* n == 6 */
+                int y, u, v;
+                u = (int8_t)data[4];
+                v = (int8_t)data[5];
                 for(k=0; k<4; ++k) {
-                    r = *p++ + v*2;
-                    g = *p++ - (u/2) - v;
-                    b = *p   + u*2;
-                    p -= 2;
-                    *p++ = av_clip_uint8(r);
-                    *p++ = av_clip_uint8(g);
-                    *p++ = av_clip_uint8(b);
+                    y = *data++;
+/* here the cinepak color space excels */
+                    *p++ = av_clip_uint8(y + v*2);
+                    *p++ = av_clip_uint8(y - (u/2) - v);
+                    *p++ = av_clip_uint8(y + u*2);
                 }
+                data += 2;
             }
         } else {
             p += 12;
@@ -129,134 +291,448 @@ static void cinepak_decode_codebook (cvid_codebook *codebook,
     }
 }
 
-static int cinepak_decode_vectors (CinepakContext *s, cvid_strip *strip,
+static void cinepak_decode_codebook_rgb565 (CinepakContext *s,
+    cvid_codebook *codebook, int chunk_id, int size, const uint8_t *data)
+{
+    CODEBOOK_HEAD
+    uint16_t *p = codebook->rgb565[0];
+
+    CODEBOOK_FULL_COLOR
+
+            if (n == 4)
+                if (palette_video)
+                    for (k = 0; k < 4; ++k) {
+                        uint32_t r = s->pal[*data++];
+                        *p++ = PACK_RGB_RGB565((r>>16)&0xff,
+                                               (r>>8)&0xff,
+                                                r&0xff);
+                    }
+                else
+                    for (k = 0; k < 4; ++k) {
+                        int r = *data++;
+                        *p++ = PACK_RGB_RGB565(r,r,r);
+                    }
+            else { /* n == 6 */
+                int y, u, v;
+                u = (int8_t)data[4];
+                v = (int8_t)data[5];
+                for(k=0; k<4; ++k) {
+                    y = *data++;
+/* here the cinepak color space excels */
+                    *p++ = PACK_RGB_RGB565(y + v*2,
+                                           y - (u/2) - v,
+                                           y + u*2);
+                }
+                data += 2;
+            }
+        } else {
+            p += 4;
+        }
+    }
+}
+
+/* a simplistic version to begin with, it is also fast -- rl */
+static void cinepak_decode_codebook_yuv420 (CinepakContext *s,
+    cvid_codebook *codebook, int chunk_id, int size, const uint8_t *data)
+{
+    CODEBOOK_HEAD
+    uint8_t *p = codebook->yuv420[0];
+
+    CODEBOOK_FULL_COLOR
+
+            if (n == 4)
+                if (palette_video) {
+/* here we have kind of "more" data than the output format can express */
+                    int r, g, b, u = 0, v = 0;
+                    for (k = 0; k < 4; ++k) {
+                        uint32_t rr = s->pal[*data++];
+                        r = (rr>>16)&0xff;
+                        g = (rr>>8) &0xff;
+                        b =  rr     &0xff;
+/* calculate the components (https://en.wikipedia.org/wiki/YUV) */
+                        *p++ = ((r*66+g*129+b*25+128)>>8)+16;
+                        u += (-r*38-g*74+b*112+128)>>8;
+                        v += (r*112-g*94-b*18+128)>>8;
+                    }
+                    *p++ = (u+2)/4+128;
+                    *p++ = (v+2)/4+128;
+                } else { /* grayscale, easy */
+                    for (k = 0; k < 4; ++k) {
+                        *p++ = *data++;
+                    }
+                    *p++ = 128;
+                    *p++ = 128;
+                }
+            else { /* n == 6 */
+/* here we'd have to handle double format conversion
+ * Cinepak=>rgb24 and then rgb24=>yuv420p, which can not be shortcut;
+ * for the moment just copying as-is, for simplicity and speed,
+ * color will be slightly off but not much */
+                *p++ = *data++;
+                *p++ = *data++;
+                *p++ = *data++;
+                *p++ = *data++;
+                *p++ = *data++ + 128;
+                *p++ = *data++ + 128;
+            }
+        } else {
+            p += 6;
+        }
+    }
+}
+
+/* here we do not expect anything besides palettized video,
+ * nor check the data for validity, which should be ok
+ * as long as we do not write beyond the bounds */
+static void cinepak_decode_codebook_pal8 (CinepakContext *s,
+    cvid_codebook *codebook, int chunk_id, int size, const uint8_t *data)
+{
+    const uint8_t *eod;
+    uint32_t flag, mask;
+    int      i;
+    int selective_update;
+    uint8_t *p = codebook->pal8[0];
+
+#define PAL8_VECTOR_LENGTH 4
+#define n PAL8_VECTOR_LENGTH
+/*    av_assert0(chunk_id & 0x04); */
+
+    CODEBOOK_INTRO
+    CODEBOOK_STREAM_PARSING
+
+#undef n
+
+            for (k = 0; k < 4; ++k)
+                *p++ = *data++;
+        } else {
+            p += 4;
+        }
+    }
+}
+
+static int cinepak_decode_vectors_rgb32 (CinepakContext *s, cvid_strip *strip,
+                                   int chunk_id, int size, const uint8_t *data)
+{
+    const uint8_t   *eod;
+    uint32_t         flag, mask;
+    uint32_t         *cb0, *cb1, *cb2, *cb3;
+    int              x, y;
+    char            *ip0, *ip1, *ip2, *ip3;
+    int selective_update;
+    int v1_only;
+
+    VECTOR_INTRO
+
+        ip0 = ip1 = ip2 = ip3 = s->frame->data[0] +
+                                strip->x1*4 + y*s->frame->linesize[0];
+#define POINTER_TYPE uint32_t
+    VECTOR_DO
+#undef POINTER_TYPE
+
+                    p = strip->v1_codebook.rgb32[*data++] + 2; /* ... + 8 */
+                    memcpy(ip3 + 0, p, 4); memcpy(ip3 + 4, p, 4);
+                    memcpy(ip2 + 0, p, 4); memcpy(ip2 + 4, p, 4);
+                    p += 1; /* ... + 12 */
+                    memcpy(ip3 + 8, p, 4); memcpy(ip3 + 12, p, 4);
+                    memcpy(ip2 + 8, p, 4); memcpy(ip2 + 12, p, 4);
+                    p -= 3; /* ... + 0 */
+                    memcpy(ip1 + 0, p, 4); memcpy(ip1 + 4, p, 4);
+                    memcpy(ip0 + 0, p, 4); memcpy(ip0 + 4, p, 4);
+                    p += 1; /* ... + 4 */
+                    memcpy(ip1 + 8, p, 4); memcpy(ip1 + 12, p, 4);
+                    memcpy(ip0 + 8, p, 4); memcpy(ip0 + 12, p, 4);
+
+                } else if (flag & mask) {
+                    if ((data + 4) > eod)
+                        return AVERROR_INVALIDDATA;
+
+                    cb0 = strip->v4_codebook.rgb32[*data++];
+                    cb1 = strip->v4_codebook.rgb32[*data++];
+                    cb2 = strip->v4_codebook.rgb32[*data++];
+                    cb3 = strip->v4_codebook.rgb32[*data++];
+                    memcpy(ip3 + 0, cb2 + 2, 8);
+                    memcpy(ip3 + 8, cb3 + 2, 8);
+                    memcpy(ip2 + 0, cb2 + 0, 8);
+                    memcpy(ip2 + 8, cb3 + 0, 8);
+                    memcpy(ip1 + 0, cb0 + 2, 8);
+                    memcpy(ip1 + 8, cb1 + 2, 8);
+                    memcpy(ip0 + 0, cb0 + 0, 8);
+                    memcpy(ip0 + 8, cb1 + 0, 8);
+
+                }
+            }
+
+            ip0 += 16;  ip1 += 16;
+            ip2 += 16;  ip3 += 16;
+        }
+    }
+
+    return 0;
+}
+
+static int cinepak_decode_vectors_rgb24 (CinepakContext *s, cvid_strip *strip,
                                    int chunk_id, int size, const uint8_t *data)
 {
-    const uint8_t   *eod = (data + size);
+    const uint8_t   *eod;
     uint32_t         flag, mask;
     uint8_t         *cb0, *cb1, *cb2, *cb3;
-    int             x, y;
+    int              x, y;
     char            *ip0, *ip1, *ip2, *ip3;
+    int selective_update;
+    int v1_only;
 
-    flag = 0;
-    mask = 0;
+    VECTOR_INTRO
 
-    for (y=strip->y1; y < strip->y2; y+=4) {
+        ip0 = ip1 = ip2 = ip3 = s->frame->data[0] +
+                                strip->x1*3 + y*s->frame->linesize[0];
+
+#define POINTER_TYPE uint8_t
+    VECTOR_DO
+#undef POINTER_TYPE
+
+                    p = strip->v1_codebook.rgb24[*data++] + 6;
+                    memcpy(ip3 + 0, p, 3); memcpy(ip3 + 3, p, 3);
+                    memcpy(ip2 + 0, p, 3); memcpy(ip2 + 3, p, 3);
+                    p += 3; /* ... + 9 */
+                    memcpy(ip3 + 6, p, 3); memcpy(ip3 + 9, p, 3);
+                    memcpy(ip2 + 6, p, 3); memcpy(ip2 + 9, p, 3);
+                    p -= 9; /* ... + 0 */
+                    memcpy(ip1 + 0, p, 3); memcpy(ip1 + 3, p, 3);
+                    memcpy(ip0 + 0, p, 3); memcpy(ip0 + 3, p, 3);
+                    p += 3; /* ... + 3 */
+                    memcpy(ip1 + 6, p, 3); memcpy(ip1 + 9, p, 3);
+                    memcpy(ip0 + 6, p, 3); memcpy(ip0 + 9, p, 3);
+
+                } else if (flag & mask) {
+                    if ((data + 4) > eod)
+                        return AVERROR_INVALIDDATA;
+
+                    cb0 = strip->v4_codebook.rgb24[*data++];
+                    cb1 = strip->v4_codebook.rgb24[*data++];
+                    cb2 = strip->v4_codebook.rgb24[*data++];
+                    cb3 = strip->v4_codebook.rgb24[*data++];
+                    memcpy(ip3 + 0, cb2 + 6, 6);
+                    memcpy(ip3 + 6, cb3 + 6, 6);
+                    memcpy(ip2 + 0, cb2 + 0, 6);
+                    memcpy(ip2 + 6, cb3 + 0, 6);
+                    memcpy(ip1 + 0, cb0 + 6, 6);
+                    memcpy(ip1 + 6, cb1 + 6, 6);
+                    memcpy(ip0 + 0, cb0 + 0, 6);
+                    memcpy(ip0 + 6, cb1 + 0, 6);
+
+                }
+            }
+
+            ip0 += 12;  ip1 += 12;
+            ip2 += 12;  ip3 += 12;
+        }
+    }
+
+    return 0;
+}
+
+static int cinepak_decode_vectors_rgb565 (CinepakContext *s, cvid_strip *strip,
+                                   int chunk_id, int size, const uint8_t *data)
+{
+    const uint8_t   *eod;
+    uint32_t         flag, mask;
+    uint16_t        *cb0, *cb1, *cb2, *cb3;
+    int              x, y;
+    char            *ip0, *ip1, *ip2, *ip3;
+    int selective_update;
+    int v1_only;
+
+    VECTOR_INTRO
+
+        ip0 = ip1 = ip2 = ip3 = s->frame->data[0] +
+                                strip->x1*2 + y*s->frame->linesize[0];
+
+#define POINTER_TYPE uint16_t
+    VECTOR_DO
+#undef POINTER_TYPE
+
+                    p = strip->v1_codebook.rgb565[*data++];
+                    * (uint16_t *)ip3    = *((uint16_t *)ip3+1) =
+                    * (uint16_t *)ip2    = *((uint16_t *)ip2+1) = p[2];
+                    *((uint16_t *)ip3+2) = *((uint16_t *)ip3+3) =
+                    *((uint16_t *)ip2+2) = *((uint16_t *)ip2+3) = p[3];
+                    * (uint16_t *)ip1    = *((uint16_t *)ip1+1) =
+                    * (uint16_t *)ip0    = *((uint16_t *)ip0+1) = p[0];
+                    *((uint16_t *)ip1+2) = *((uint16_t *)ip1+3) =
+                    *((uint16_t *)ip0+2) = *((uint16_t *)ip0+3) = p[1];
+
+                } else if (flag & mask) {
+                    if ((data + 4) > eod)
+                        return AVERROR_INVALIDDATA;
+
+                    cb0 = strip->v4_codebook.rgb565[*data++];
+                    cb1 = strip->v4_codebook.rgb565[*data++];
+                    cb2 = strip->v4_codebook.rgb565[*data++];
+                    cb3 = strip->v4_codebook.rgb565[*data++];
+                    memcpy(ip3 + 0, cb2 + 2, 4);
+                    memcpy(ip3 + 4, cb3 + 2, 4);
+                    memcpy(ip2 + 0, cb2 + 0, 4);
+                    memcpy(ip2 + 4, cb3 + 0, 4);
+                    memcpy(ip1 + 0, cb0 + 2, 4);
+                    memcpy(ip1 + 4, cb1 + 2, 4);
+                    memcpy(ip0 + 0, cb0 + 0, 4);
+                    memcpy(ip0 + 4, cb1 + 0, 4);
+
+                }
+            }
+
+            ip0 += 8;  ip1 += 8;
+            ip2 += 8;  ip3 += 8;
+        }
+    }
+
+    return 0;
+}
+
+static int cinepak_decode_vectors_yuv420 (CinepakContext *s, cvid_strip *strip,
+                                   int chunk_id, int size, const uint8_t *data)
+{
+    const uint8_t   *eod;
+    uint32_t         flag, mask;
+    uint8_t         *cb0, *cb1, *cb2, *cb3;
+    int              x, y;
+    char            *ip0, *ip1, *ip2, *ip3,
+                    *up01, *up23, *vp01, *vp23;
+    int selective_update;
+    int v1_only;
+
+    VECTOR_INTRO
 
-/* take care of y dimension not being multiple of 4, such streams exist */
         ip0 = ip1 = ip2 = ip3 = s->frame->data[0] +
-          (s->palette_video?strip->x1:strip->x1*3) + (y * s->frame->linesize[0]);
+                                strip->x1*3 + y*s->frame->linesize[0];
+        up01 = up23 = s->frame->data[1] + strip->x1 + y/2*s->frame->linesize[1];
+        vp01 = vp23 = s->frame->data[2] + strip->x1 + y/2*s->frame->linesize[2];
         if(s->avctx->height - y > 1) {
             ip1 = ip0 + s->frame->linesize[0];
             if(s->avctx->height - y > 2) {
                 ip2 = ip1 + s->frame->linesize[0];
+                up23 = up01 + s->frame->linesize[1];
+                vp23 = vp01 + s->frame->linesize[2];
                 if(s->avctx->height - y > 3) {
                     ip3 = ip2 + s->frame->linesize[0];
                 }
             }
         }
+
 /* to get the correct picture for not-multiple-of-4 cases let us fill each
  * block from the bottom up, thus possibly overwriting the bottommost line
  * more than once but ending with the correct data in place
  * (instead of in-loop checking) */
 
-        for (x=strip->x1; x < strip->x2; x+=4) {
-            if ((chunk_id & 0x01) && !(mask >>= 1)) {
-                if ((data + 4) > eod)
-                    return AVERROR_INVALIDDATA;
+#define POINTER_TYPE uint8_t
+        VECTOR_STREAM_PARSING
+#undef POINTER_TYPE
 
-                flag  = AV_RB32 (data);
-                data += 4;
-                mask  = 0x80000000;
-            }
+                    p = strip->v1_codebook.yuv420[*data++];
+                    ip3[0] = ip3[1] = ip2[0] = ip2[1] = p[2];
+                    ip3[2] = ip3[3] = ip2[2] = ip2[3] = p[3];
+                    ip1[0] = ip1[1] = ip0[0] = ip0[1] = p[0];
+                    ip1[2] = ip1[3] = ip0[2] = ip0[3] = p[1];
+                    p += 4;
+                    up01[0] = up01[1] = up23[0] = up23[1] = *p++;
+                    vp01[0] = vp01[1] = vp23[0] = vp23[1] = *p++;
 
-            if (!(chunk_id & 0x01) || (flag & mask)) {
-                if (!(chunk_id & 0x02) && !(mask >>= 1)) {
+                } else if (flag & mask) {
                     if ((data + 4) > eod)
                         return AVERROR_INVALIDDATA;
 
-                    flag  = AV_RB32 (data);
-                    data += 4;
-                    mask  = 0x80000000;
+                    cb0 = strip->v4_codebook.yuv420[*data++];
+                    cb1 = strip->v4_codebook.yuv420[*data++];
+                    cb2 = strip->v4_codebook.yuv420[*data++];
+                    cb3 = strip->v4_codebook.yuv420[*data++];
+                    memcpy(ip3 + 0, cb2 + 2, 2);
+                    memcpy(ip3 + 2, cb3 + 2, 2);
+                    memcpy(ip2 + 0, cb2 + 0, 2);
+                    memcpy(ip2 + 2, cb3 + 0, 2);
+                    memcpy(ip1 + 0, cb0 + 2, 2);
+                    memcpy(ip1 + 2, cb1 + 2, 2);
+                    memcpy(ip0 + 0, cb0 + 0, 2);
+                    memcpy(ip0 + 2, cb1 + 0, 2);
+                    cb0 += 4; cb1 += 4; cb2 += 4; cb3 += 4;
+                    up23[0] = *cb2++; vp23[0] = *cb2;
+                    up23[1] = *cb3++; vp23[1] = *cb3;
+                    up01[0] = *cb0++; vp01[0] = *cb0;
+                    up01[1] = *cb1++; vp01[1] = *cb1;
+
                 }
+            }
 
-                if ((chunk_id & 0x02) || (~flag & mask)) {
-                    uint8_t *p;
-                    if (data >= eod)
-                        return AVERROR_INVALIDDATA;
+            ip0 += 4;  ip1 += 4;
+            ip2 += 4;  ip3 += 4;
+            up01 += 2; up23 += 2;
+            vp01 += 2; vp23 += 2;
+        }
+    }
 
-                    p = strip->v1_codebook[*data++];
-                    if (s->palette_video) {
-                        ip3[0] = ip3[1] = ip2[0] = ip2[1] = p[6];
-                        ip3[2] = ip3[3] = ip2[2] = ip2[3] = p[9];
-                        ip1[0] = ip1[1] = ip0[0] = ip0[1] = p[0];
-                        ip1[2] = ip1[3] = ip0[2] = ip0[3] = p[3];
-                    } else {
-                        p += 6;
-                        memcpy(ip3 + 0, p, 3); memcpy(ip3 + 3, p, 3);
-                        memcpy(ip2 + 0, p, 3); memcpy(ip2 + 3, p, 3);
-                        p += 3; /* ... + 9 */
-                        memcpy(ip3 + 6, p, 3); memcpy(ip3 + 9, p, 3);
-                        memcpy(ip2 + 6, p, 3); memcpy(ip2 + 9, p, 3);
-                        p -= 9; /* ... + 0 */
-                        memcpy(ip1 + 0, p, 3); memcpy(ip1 + 3, p, 3);
-                        memcpy(ip0 + 0, p, 3); memcpy(ip0 + 3, p, 3);
-                        p += 3; /* ... + 3 */
-                        memcpy(ip1 + 6, p, 3); memcpy(ip1 + 9, p, 3);
-                        memcpy(ip0 + 6, p, 3); memcpy(ip0 + 9, p, 3);
-                    }
+    return 0;
+}
+
+static int cinepak_decode_vectors_pal8 (CinepakContext *s, cvid_strip *strip,
+                                 int chunk_id, int size, const uint8_t *data)
+{
+    const uint8_t   *eod;
+    uint32_t         flag, mask;
+    uint8_t         *cb0, *cb1, *cb2, *cb3;
+    int              x, y;
+    char            *ip0, *ip1, *ip2, *ip3;
+    int selective_update;
+    int v1_only;
+
+    VECTOR_INTRO
+
+        ip0 = ip1 = ip2 = ip3 = s->frame->data[0] +
+                                strip->x1 + y*s->frame->linesize[0];
+
+#define POINTER_TYPE uint8_t
+    VECTOR_DO
+#undef POINTER_TYPE
+
+                    p = strip->v1_codebook.pal8[*data++];
+                    ip3[0] = ip3[1] = ip2[0] = ip2[1] = p[2];
+                    ip3[2] = ip3[3] = ip2[2] = ip2[3] = p[3];
+                    ip1[0] = ip1[1] = ip0[0] = ip0[1] = p[0];
+                    ip1[2] = ip1[3] = ip0[2] = ip0[3] = p[1];
 
                 } else if (flag & mask) {
+                    uint8_t *p;
                     if ((data + 4) > eod)
                         return AVERROR_INVALIDDATA;
 
-                    cb0 = strip->v4_codebook[*data++];
-                    cb1 = strip->v4_codebook[*data++];
-                    cb2 = strip->v4_codebook[*data++];
-                    cb3 = strip->v4_codebook[*data++];
-                    if (s->palette_video) {
-                        uint8_t *p;
-                        p = ip3;
-                        *p++ = cb2[6];
-                        *p++ = cb2[9];
-                        *p++ = cb3[6];
-                        *p   = cb3[9];
-                        p = ip2;
-                        *p++ = cb2[0];
-                        *p++ = cb2[3];
-                        *p++ = cb3[0];
-                        *p   = cb3[3];
-                        p = ip1;
-                        *p++ = cb0[6];
-                        *p++ = cb0[9];
-                        *p++ = cb1[6];
-                        *p   = cb1[9];
-                        p = ip0;
-                        *p++ = cb0[0];
-                        *p++ = cb0[3];
-                        *p++ = cb1[0];
-                        *p   = cb1[3];
-                    } else {
-                        memcpy(ip3 + 0, cb2 + 6, 6);
-                        memcpy(ip3 + 6, cb3 + 6, 6);
-                        memcpy(ip2 + 0, cb2 + 0, 6);
-                        memcpy(ip2 + 6, cb3 + 0, 6);
-                        memcpy(ip1 + 0, cb0 + 6, 6);
-                        memcpy(ip1 + 6, cb1 + 6, 6);
-                        memcpy(ip0 + 0, cb0 + 0, 6);
-                        memcpy(ip0 + 6, cb1 + 0, 6);
-                    }
+                    cb0 = strip->v4_codebook.pal8[*data++];
+                    cb1 = strip->v4_codebook.pal8[*data++];
+                    cb2 = strip->v4_codebook.pal8[*data++];
+                    cb3 = strip->v4_codebook.pal8[*data++];
+                    p = ip3;
+                    *p++ = cb2[2];
+                    *p++ = cb2[3];
+                    *p++ = cb3[2];
+                    *p   = cb3[3];
+                    p = ip2;
+                    *p++ = cb2[0];
+                    *p++ = cb2[1];
+                    *p++ = cb3[0];
+                    *p   = cb3[1];
+                    p = ip1;
+                    *p++ = cb0[2];
+                    *p++ = cb0[3];
+                    *p++ = cb1[2];
+                    *p   = cb1[3];
+                    p = ip0;
+                    *p++ = cb0[0];
+                    *p++ = cb0[1];
+                    *p++ = cb1[0];
+                    *p   = cb1[1];
 
                 }
             }
 
-            if (s->palette_video) {
-                ip0 += 4;  ip1 += 4;
-                ip2 += 4;  ip3 += 4;
-            } else {
-                ip0 += 12;  ip1 += 12;
-                ip2 += 12;  ip3 += 12;
-            }
+            ip0 += 4;  ip1 += 4;
+            ip2 += 4;  ip3 += 4;
         }
     }
 
@@ -290,22 +766,22 @@ static int cinepak_decode_strip (CinepakContext *s,
         case 0x21:
         case 0x24:
         case 0x25:
-            cinepak_decode_codebook (strip->v4_codebook, chunk_id,
-                chunk_size, data);
+            s->decode_codebook(s, &strip->v4_codebook,
+                chunk_id, chunk_size, data);
             break;
 
         case 0x22:
         case 0x23:
         case 0x26:
         case 0x27:
-            cinepak_decode_codebook (strip->v1_codebook, chunk_id,
-                chunk_size, data);
+            s->decode_codebook (s, &strip->v1_codebook,
+                chunk_id, chunk_size, data);
             break;
 
         case 0x30:
         case 0x31:
         case 0x32:
-            return cinepak_decode_vectors (s, strip, chunk_id,
+            return s->decode_vectors (s, strip, chunk_id,
                 chunk_size, data);
         }
 
@@ -385,9 +861,9 @@ static int cinepak_decode (CinepakContext *s)
         strip_size = ((s->data + strip_size) > eod) ? (eod - s->data) : strip_size;
 
         if ((i > 0) && !(frame_flags & 0x01)) {
-            memcpy (s->strips[i].v4_codebook, s->strips[i-1].v4_codebook,
+            memcpy (&s->strips[i].v4_codebook, &s->strips[i-1].v4_codebook,
                 sizeof(s->strips[i].v4_codebook));
-            memcpy (s->strips[i].v1_codebook, s->strips[i-1].v1_codebook,
+            memcpy (&s->strips[i].v1_codebook, &s->strips[i-1].v1_codebook,
                 sizeof(s->strips[i].v1_codebook));
         }
 
@@ -402,23 +878,85 @@ static int cinepak_decode (CinepakContext *s)
     return 0;
 }
 
+/* candidate output formats for palettized input (also advertised as .pix_fmts) */
+static const enum AVPixelFormat pixfmt_list[] = {
+    AV_PIX_FMT_RGB24,
+    AV_PIX_FMT_RGB32,
+    AV_PIX_FMT_RGB565,
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_PAL8, /* only when input is palettized */
+    AV_PIX_FMT_NONE
+};
+
+/* candidate output formats for non-palettized input (no PAL8) */
+static const enum AVPixelFormat pixfmt_list_2[] = {
+    AV_PIX_FMT_RGB24,
+    AV_PIX_FMT_RGB32,
+    AV_PIX_FMT_RGB565,
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_NONE
+};
+
 static av_cold int cinepak_decode_init(AVCodecContext *avctx)
 {
     CinepakContext *s = avctx->priv_data;
 
+/* we take advantage of VQ to efficiently support
+ * multiple output formats */
+
     s->avctx = avctx;
     s->width = (avctx->width + 3) & ~3;
     s->height = (avctx->height + 3) & ~3;
 
     s->sega_film_skip_bytes = -1;  /* uninitialized state */
 
-    // check for paletted data
-    if (avctx->bits_per_coded_sample != 8) {
-        s->palette_video = 0;
-        avctx->pix_fmt = AV_PIX_FMT_RGB24;
-    } else {
-        s->palette_video = 1;
-        avctx->pix_fmt = AV_PIX_FMT_PAL8;
+    /* check for paletted data */
+    s->palette_video = (avctx->bits_per_coded_sample == 8);
+
+/* If you are in a crisis, needing to influence the format choice
+ * in the decoder to work around dumb/misbehaving applications,
+ * here would be the place to insert checking some dedicated
+ * environment variable -- rl
+ * NOTE that ffmpeg API policies as of 2017 STRONGLY DISCOURAGE
+ * taking such freedoms, do not bother the developers with this,
+ * even as an example such code is not welcome */
+
+    if (s->out_pixfmt != AV_PIX_FMT_NONE) /* the option is set to something */
+        avctx->pix_fmt = s->out_pixfmt;
+    else
+        if (s->palette_video)
+            avctx->pix_fmt = ff_get_format(avctx, pixfmt_list);
+        else
+            avctx->pix_fmt = ff_get_format(avctx, pixfmt_list_2);
+
+    switch (avctx->pix_fmt) {
+    case AV_PIX_FMT_RGB32:
+        s->decode_codebook = cinepak_decode_codebook_rgb32;
+        s->decode_vectors  = cinepak_decode_vectors_rgb32;
+        break;
+    case AV_PIX_FMT_RGB24:
+        s->decode_codebook = cinepak_decode_codebook_rgb24;
+        s->decode_vectors  = cinepak_decode_vectors_rgb24;
+        break;
+    case AV_PIX_FMT_RGB565:
+        s->decode_codebook = cinepak_decode_codebook_rgb565;
+        s->decode_vectors  = cinepak_decode_vectors_rgb565;
+        break;
+    case AV_PIX_FMT_YUV420P:
+        s->decode_codebook = cinepak_decode_codebook_yuv420;
+        s->decode_vectors  = cinepak_decode_vectors_yuv420;
+        break;
+    case AV_PIX_FMT_PAL8:
+        if (!s->palette_video) {
+            av_log(avctx, AV_LOG_ERROR, "Palettized output not supported without palettized input\n");
+            return AVERROR(EINVAL);
+        }
+        s->decode_codebook = cinepak_decode_codebook_pal8;
+        s->decode_vectors  = cinepak_decode_vectors_pal8;
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unsupported pixel format %d\n", avctx->pix_fmt);
+        return AVERROR(EINVAL);
     }
 
     s->frame = av_frame_alloc();
@@ -457,7 +995,7 @@ static int cinepak_decode_frame(AVCodecContext *avctx,
         av_log(avctx, AV_LOG_ERROR, "cinepak_decode failed\n");
     }
 
-    if (s->palette_video)
+    if (avctx->pix_fmt == AV_PIX_FMT_PAL8)
         memcpy (s->frame->data[1], s->pal, AVPALETTE_SIZE);
 
     if ((ret = av_frame_ref(data, s->frame)) < 0)
@@ -488,4 +1026,6 @@ AVCodec ff_cinepak_decoder = {
     .close          = cinepak_decode_end,
     .decode         = cinepak_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .pix_fmts       = pixfmt_list,
+    .priv_class     = &cinepak_class,
 };
-- 
2.11.0