[FFmpeg-devel] [PATCH] avcodec/ffv1: WIP Single precision float support

Wed Mar 19 02:34:53 EET 2025

No, this does not work yet; I am just posting it for anyone who is curious.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Michael Niedermayer <michael at niedermayer.cc>
---
 libavcodec/ffv1.h             |   5 +-
 libavcodec/ffv1_parse.c       |  10 ++
 libavcodec/ffv1dec.c          |  36 ++++--
 libavcodec/ffv1dec_template.c |  30 +++--
 libavcodec/ffv1enc.c          | 211 +++++++++++++++++++++++++++++++++-
 5 files changed, 272 insertions(+), 20 deletions(-)

diff --git a/libavcodec/ffv1.h b/libavcodec/ffv1.h
index dd8a236efad..4db9a303bf1 100644
--- a/libavcodec/ffv1.h
+++ b/libavcodec/ffv1.h
@@ -109,7 +109,10 @@ typedef struct FFV1SliceContext {
             uint64_t (*rc_stat2[MAX_QUANT_TABLES])[32][2];
         };
     };
-    uint16_t   fltmap[4][65536];
+    union {
+        uint16_t   fltmap  [4][65536]; //halffloat encode & decode
+        uint32_t   fltmap32[4][65536]; //float decode
+    };
 } FFV1SliceContext;
 
 typedef struct FFV1Context {
diff --git a/libavcodec/ffv1_parse.c b/libavcodec/ffv1_parse.c
index 9745f9de694..10f3652ff51 100644
--- a/libavcodec/ffv1_parse.c
+++ b/libavcodec/ffv1_parse.c
@@ -419,6 +419,16 @@ int ff_ffv1_parse_header(FFV1Context *f, RangeCoder *c, uint8_t *state)
             } else
                 f->pix_fmt = AV_PIX_FMT_GBRAP16;
             f->use32bit = 1;
+        } else if (f->avctx->bits_per_raw_sample == 32 && !f->transparency) {
+            if (f->flt) {
+                f->pix_fmt = AV_PIX_FMT_GBRPF32;
+            }
+            f->use32bit = 1;
+        } else if (f->avctx->bits_per_raw_sample == 32 && f->transparency) {
+            if (f->flt) {
+                f->pix_fmt = AV_PIX_FMT_GBRAPF32;
+            }
+            f->use32bit = 1;
         }
     } else {
         av_log(f->avctx, AV_LOG_ERROR, "colorspace not supported\n");
diff --git a/libavcodec/ffv1dec.c b/libavcodec/ffv1dec.c
index 75fb5ae2f69..d45aabbbde8 100644
--- a/libavcodec/ffv1dec.c
+++ b/libavcodec/ffv1dec.c
@@ -250,6 +250,16 @@ static int decode_slice_header(const FFV1Context *f,
             sc->rawlsb = ff_ffv1_get_symbol(c, state, 0);
         }
     }
+    if (f->avctx->bits_per_raw_sample == 32) {
+        if (!sc->remap) {
+            av_log(f->avctx, AV_LOG_ERROR, "unsupported remap\n");
+            return AVERROR_INVALIDDATA;
+        }
+        if (sc->slice_width * sc->slice_height > 65536) {
+            av_log(f->avctx, AV_LOG_ERROR, "32bit needs remap\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
 
     return 0;
 }
@@ -266,28 +276,38 @@ static void slice_set_damaged(FFV1Context *f, FFV1SliceContext *sc)
 
 static int decode_remap(FFV1Context *f, FFV1SliceContext *sc)
 {
-    int flip = sc->remap == 2 ? 0x7FFF : 0;
+    unsigned int end = f->avctx->bits_per_raw_sample == 32 ? 0xFFFFFFFF : 0xFFFF;
+    int flip = sc->remap == 2 ? (end>>1) : 0;
+    int sign = (end>>1)+1;
 
     for (int p= 0; p < 1 + 2*f->chroma_planes + f->transparency; p++) {
         int j = 0;
         int lu = 0;
         uint8_t state[2][32];
+        int64_t i;
         memset(state, 128, sizeof(state));
-
-        for (int i= 0; i<65536; i++) {
-            int run = get_symbol_inline(&sc->c, state[lu], 0);
-            if (run > 65536U - i)
+        for (i=0; i <= end ; i++) {
+            unsigned run = get_symbol_inline(&sc->c, state[lu], 0);
+            if (run > end - i + 1)
                 return AVERROR_INVALIDDATA;
             if (lu) {
                 lu ^= !run;
                 while (run--) {
-                    sc->fltmap[p][j++] = i ^ ((i&0x8000) ? 0 : flip);
+                    if (end == 0xFFFF) {
+                        sc->fltmap  [p][j++] = i ^ ((i&    0x8000) ? 0 : flip);
+                    } else
+                        sc->fltmap32[p][j++] = i ^ ((i&0x80000000) ? 0 : flip);
                     i++;
                 }
             } else {
                 i += run;
-                if (i != 65536)
-                    sc->fltmap[p][j++] = i ^ ((i&0x8000) ? 0 : flip);
+                if (i <= end) {
+                    if (end == 0xFFFF) {
+                        sc->fltmap  [p][j++] = i ^ ((i&    0x8000) ? 0 : flip);
+                    } else {
+                        sc->fltmap32[p][j++] = i ^ ((i&0x80000000) ? 0 : flip);
+                    }
+                }
                 lu ^= !run;
             }
         }
diff --git a/libavcodec/ffv1dec_template.c b/libavcodec/ffv1dec_template.c
index f9499931b1d..4e500ab5212 100644
--- a/libavcodec/ffv1dec_template.c
+++ b/libavcodec/ffv1dec_template.c
@@ -150,7 +150,7 @@ static int RENAME(decode_rgb_frame)(FFV1Context *f, FFV1SliceContext *sc,
     int x, y, p;
     TYPE *sample[4][2];
     int lbd    = f->avctx->bits_per_raw_sample <= 8;
-    int bits   = f->avctx->bits_per_raw_sample > 0 ? f->avctx->bits_per_raw_sample : 8;
+    int bits   = f->avctx->bits_per_raw_sample > 0 ? FFMIN(f->avctx->bits_per_raw_sample, 16) : 8;
     int offset = 1 << bits;
     int transparency = f->transparency;
     int ac = f->ac;
@@ -198,16 +198,30 @@ static int RENAME(decode_rgb_frame)(FFV1Context *f, FFV1SliceContext *sc,
                 r += g;
             }
             if (sc->remap) {
-                r = sc->fltmap[0][r & 0xFFFF];
-                g = sc->fltmap[1][g & 0xFFFF];
-                b = sc->fltmap[2][b & 0xFFFF];
-                if (transparency)
-                    a = sc->fltmap[3][a & 0xFFFF];
+                if (f->avctx->bits_per_raw_sample == 32) {
+                    r = sc->fltmap32[0][r & 0xFFFF];
+                    g = sc->fltmap32[1][g & 0xFFFF];
+                    b = sc->fltmap32[2][b & 0xFFFF];
+                    if (transparency)
+                        a = sc->fltmap32[3][a & 0xFFFF];
+                } else {
+                    r = sc->fltmap[0][r & 0xFFFF];
+                    g = sc->fltmap[1][g & 0xFFFF];
+                    b = sc->fltmap[2][b & 0xFFFF];
+                    if (transparency)
+                        a = sc->fltmap[3][a & 0xFFFF];
+                }
             }
 
-            if (lbd)
+            if (lbd) {
                 *((uint32_t*)(src[0] + x*4 + stride[0]*y)) = b + ((unsigned)g<<8) + ((unsigned)r<<16) + ((unsigned)a<<24);
-            else if (sizeof(TYPE) == 4 || transparency) {
+            } else if (f->avctx->bits_per_raw_sample == 32) {
+                *((uint32_t*)(src[0] + x*4 + stride[0]*y)) = g;
+                *((uint32_t*)(src[1] + x*4 + stride[1]*y)) = b;
+                *((uint32_t*)(src[2] + x*4 + stride[2]*y)) = r;
+                if (transparency)
+                    *((uint32_t*)(src[3] + x*4 + stride[3]*y)) = a;
+            } else if (sizeof(TYPE) == 4 || transparency) {
                 *((uint16_t*)(src[0] + x*2 + stride[0]*y)) = g;
                 *((uint16_t*)(src[1] + x*2 + stride[1]*y)) = b;
                 *((uint16_t*)(src[2] + x*2 + stride[2]*y)) = r;
diff --git a/libavcodec/ffv1enc.c b/libavcodec/ffv1enc.c
index 64add25b407..58e1227bd6c 100644
--- a/libavcodec/ffv1enc.c
+++ b/libavcodec/ffv1enc.c
@@ -31,6 +31,7 @@
 #include "libavutil/mem.h"
 #include "libavutil/opt.h"
 #include "libavutil/pixdesc.h"
+#include "libavutil/qsort.h"
 
 #include "avcodec.h"
 #include "encode.h"
@@ -576,6 +577,9 @@ int ff_ffv1_encode_determine_slices(AVCodecContext *avctx)
                 continue;
             if (maxw * maxh * (int64_t)(s->bits_per_raw_sample+1) * plane_count > 8<<24)
                 continue;
+            if (s->bits_per_raw_sample == 32)
+                if (maxw * maxh > 65536)
+                    continue;
             if (s->version < 4)
                 if (  ff_need_new_slices(avctx->width , s->num_h_slices, s->chroma_h_shift)
                     ||ff_need_new_slices(avctx->height, s->num_v_slices, s->chroma_v_shift))
@@ -920,6 +924,10 @@ av_cold int ff_ffv1_encode_setup_plane_info(AVCodecContext *avctx,
     case AV_PIX_FMT_GBRAPF16:
         if (!avctx->bits_per_raw_sample && !s->bits_per_raw_sample)
             s->bits_per_raw_sample = 16;
+    case AV_PIX_FMT_GBRPF32:
+    case AV_PIX_FMT_GBRAPF32:
+        if (!avctx->bits_per_raw_sample && !s->bits_per_raw_sample)
+            s->bits_per_raw_sample = 32;
         else if (!s->bits_per_raw_sample)
             s->bits_per_raw_sample = avctx->bits_per_raw_sample;
         s->transparency = !!(desc->flags & AV_PIX_FMT_FLAG_ALPHA);
@@ -942,6 +950,10 @@ av_cold int ff_ffv1_encode_setup_plane_info(AVCodecContext *avctx,
 
     if (s->remap_mode < 0)
         s->remap_mode = s->flt ? 2 : 0;
+    if (s->remap_mode == 0 && s->bits_per_raw_sample == 32) {
+        av_log(avctx, AV_LOG_ERROR, "32bit requires remap\n");
+        return AVERROR(EINVAL);
+    }
 
     return av_pix_fmt_get_chroma_sub_sample(pix_fmt, &s->chroma_h_shift, &s->chroma_v_shift);
 }
@@ -1158,7 +1170,7 @@ static void choose_rct_params(const FFV1Context *f, FFV1SliceContext *sc,
     sc->slice_rct_ry_coef = rct_y_coeff[best][0];
 }
 
-static void encode_remap(FFV1Context *f, FFV1SliceContext *sc)
+static void encode_histogram_remap(FFV1Context *f, FFV1SliceContext *sc)
 {
     int flip = sc->remap == 2 ? 0x7FFF : 0;
 
@@ -1188,6 +1200,180 @@ static void encode_remap(FFV1Context *f, FFV1SliceContext *sc)
     }
 }
 
+typedef struct Unit { // one sample of a slice plane; arrays of these get sorted by val
+    uint32_t val; // 32-bit sample value (sort key); this is unneeded if you accept a dereference on each access
+    int ndx; // byte offset of the sample in its plane as filled by the loader (x*4 + stride*y); original note: unsigned 16 bit would suffice but make code more complex/slow (presumably if a pixel number were stored instead - TODO confirm)
+} Unit;
+
+/**
+ * Collect the samples of one slice as (value, byte offset) pairs and sort them.
+ *
+ * unit[0..3] receive the R, G, B and (with transparency) A samples, which are
+ * read from src[2], src[0], src[1] and src[3]. When sc->remap is 2 the low
+ * 31 bits of every value without sign bit are inverted before sorting; this
+ * mirrors the flip decode_remap() applies and it changes the sort order.
+ * w * h must not exceed the 65536 entries each unit[] plane can hold.
+ */
+static void load_rgb_float32_frame(FFV1Context *f, FFV1SliceContext *sc,
+                                   const uint8_t *src[4],
+                                   int w, int h, const int stride[4],
+                                   Unit unit[4][65536])
+{
+    // src[] index of the plane feeding unit[p]
+    static const uint8_t src_plane[4] = { 2, 0, 1, 3 };
+    // XOR mask for values without sign bit, 0 disables the flip
+    const uint32_t flip = sc->remap == 2 ? 0x7FFFFFFF : 0;
+    const int nb_planes = 3 + !!f->transparency;
+    int i = 0;
+
+    // checked before the first store; the only other check of the pixel count runs after this function
+    av_assert0((int64_t)w * h <= 65536);
+
+    for (int y = 0; y < h; y++) {
+        for (int x = 0; x < w; x++) {
+            // We cannot build a histogram as we do for 16bit, we need a bit of magic here
+            // It is possible to reduce the memory needed at the cost of more dereferencing
+            for (int p = 0; p < nb_planes; p++) {
+                const int ndx = x*4 + stride[src_plane[p]]*y;
+                uint32_t val  = *((const uint32_t *)(src[src_plane[p]] + ndx));
+
+                if (!(val & 0x80000000))
+                    val ^= flip;
+
+                unit[p][i].val = val;
+                unit[p][i].ndx = ndx;
+            }
+            i++;
+        }
+    }
+
+    //TODO switch to radix sort
+#define CMP(A,B) ((A)->val - (int64_t)(B)->val)
+    for (int p = 0; p < nb_planes; p++) {
+        AV_QSORT(unit[p], i, Unit, CMP);
+    }
+#undef CMP
+}
+
+/* Code the 32-bit remap table of each plane and replace every sample in src by
+ * its compact index (the rank of its value among the plane's distinct values).
+ * unit[p] must hold the samples sorted by ascending val, ndx being the byte
+ * offset in the plane. For sc->remap == 2 the values must be sign flipped
+ * before sorting, the flip changes the order so it cannot be applied here. */
+static void encode_float32_remap(FFV1Context *f, FFV1SliceContext *sc,
+                                 uint8_t *src[4], Unit unit[4][65536])
+{
+    // remap planes are R, G, B, A while src[] is ordered G, B, R, A
+    static const uint8_t dst_plane[4] = { 2, 0, 1, 3 };
+    int pixel_num = sc->slice_width * sc->slice_height;
+    av_assert0 (pixel_num <= 65536);
+
+    for (int p= 0; p < 1 + 2*f->chroma_planes + f->transparency; p++) {
+        int lu = 0, run = 0;
+        uint8_t state[2][32];
+        int64_t last_val = -1;
+        int compact_index = -1; // table index of last_val, nothing coded yet
+        memset(state, 128, sizeof(state));
+        for (int i= 0; i<pixel_num+1; i++) {
+            int64_t val;
+            if (i == pixel_num) {
+                if (last_val == 0xFFFFFFFF) {
+                    // nothing follows the last code point, but a run ending on it is still pending
+                    if (lu && run)
+                        put_symbol_inline(&sc->c, state[lu], run, 0, NULL, NULL);
+                    break;
+                }
+                val = 1LL<<32; // sentinel, flushes what follows the last sample
+            } else
+                val = unit[p][i].val;
+            if (last_val != val) {
+                av_assert2(last_val < val);
+                if (lu) {
+                    if (val - last_val == 1) {
+                        run ++;
+                        last_val = val;
+                    } else {
+                        put_symbol_inline(&sc->c, state[lu], run, 0, NULL, NULL);
+                        if (run == 0)
+                            lu ^= 1;
+                        run = 0;
+                        i--; // we did not encode val so we need to backstep
+                        last_val ++;
+                        continue;
+                    }
+                } else {
+                    av_assert2(run == 0);
+                    put_symbol_inline(&sc->c, state[lu], val - last_val - 1, 0, NULL, NULL);
+                    if (val - last_val == 1)
+                        lu ^= 1;
+                    last_val = val;
+                }
+                compact_index ++;
+            }
+            if (i < pixel_num) // the sentinel has no pixel behind it
+                *((uint32_t *)(src[dst_plane[p]] + unit[p][i].ndx)) = compact_index;
+        }
+    }
+}
+
+/* Encode one slice of 32-bit float RGB(A): each row is read as 32-bit words from
+ * src[0..3] (g, b, r, a), put through the slice RCT unless slice_coding_mode is 1
+ * and coded plane by plane with encode_line32(), whose first error is returned. */
+//TODO once this is working consider factorizing with the 16bit integer version and see how it looks if its too messy or better
+static int encode_float32_rgb_frame(FFV1Context *f, FFV1SliceContext *sc,
+                                    const uint8_t *src[4],
+                                    int w, int h, const int stride[4], int ac)
+{
+    const int ring_size  = f->context_model ? 3 : 2;
+    const int first_pass = !!(f->avctx->flags & AV_CODEC_FLAG_PASS1);
+    const int has_alpha  = f->transparency;
+    const int coded_bits = 16;  //TODO explain this in the specification, we have 32bits in but really encode max 16
+    const int rct_offset = 1 << coded_bits;
+    int32_t *sample[4][3];
+
+    sc->run_index = 0;
+    memset(RENAME(sc->sample_buffer), 0, ring_size * MAX_PLANES *
+           (w + 6) * sizeof(*RENAME(sc->sample_buffer)));
+
+    for (int row = 0; row < h; row++) {
+        for (int slot = 0; slot < ring_size; slot++)
+            for (int plane = 0; plane < MAX_PLANES; plane++)
+                sample[plane][slot] = RENAME(sc->sample_buffer) + plane*ring_size*(w+6) + ((h+slot-row)%ring_size)*(w+6) + 3;
+
+        for (int col = 0; col < w; col++) {
+            int b, g, r, av_uninit(a);
+            g = *((const uint32_t *)(src[0] + col*4 + stride[0]*row));
+            b = *((const uint32_t *)(src[1] + col*4 + stride[1]*row));
+            r = *((const uint32_t *)(src[2] + col*4 + stride[2]*row));
+            if (has_alpha)
+                a = *((const uint32_t *)(src[3] + col*4 + stride[3]*row));
+
+            if (sc->slice_coding_mode != 1) {
+                b -= g;
+                r -= g;
+                g += (b * sc->slice_rct_by_coef + r * sc->slice_rct_ry_coef) >> 2;
+                b += rct_offset;
+                r += rct_offset;
+            }
+            sample[0][0][col] = g;
+            sample[1][0][col] = b;
+            sample[2][0][col] = r;
+            sample[3][0][col] = a;
+        }
+        for (int plane = 0; plane < 3 + has_alpha; plane++) {
+            int ret;
+            sample[plane][0][-1] = sample[plane][1][0  ];
+            sample[plane][1][ w] = sample[plane][1][w-1];
+            ret = encode_line32(f, sc, f->avctx, w, sample[plane], (plane + 1) / 2,
+                                coded_bits + (sc->slice_coding_mode != 1), ac, first_pass);
+            if (ret < 0)
+                return ret;
+        }
+    }
+    return 0;
+}
+
+
 static int encode_slice(AVCodecContext *c, void *arg)
 {
     FFV1SliceContext *sc = arg;
@@ -1226,6 +1412,10 @@ retry:
     }
 
     if (sc->remap) {
+      //Both the 16bit and 32bit remap do exactly the same thing but with 16bits we can
+      //Implement this using a "histogram" while for 32bit that would be gb sized, thus a more
+      //complex implementation sorting pairs is used.
+      if (f->bits_per_raw_sample != 32) {
         if (f->colorspace == 0 && c->pix_fmt != AV_PIX_FMT_YA8 && c->pix_fmt != AV_PIX_FMT_YAF16) {
             const int cx            = x >> f->chroma_h_shift;
             const int cy            = y >> f->chroma_v_shift;
@@ -1249,7 +1439,12 @@ retry:
         } else
             load_rgb_frame  (f, sc, planes, width, height, p->linesize);
 
-        encode_remap(f, sc);
+        encode_histogram_remap(f, sc);
+      } else {
+            Unit pairs[4][65536];
+            load_rgb_float32_frame(f, sc, planes, width, height, p->linesize, pairs);
+            encode_float32_remap(f, sc, planes, pairs);
+      }
     }
 
     if (ac == AC_GOLOMB_RICE) {
@@ -1281,6 +1476,8 @@ retry:
     } else if (c->pix_fmt == AV_PIX_FMT_YA8 || c->pix_fmt == AV_PIX_FMT_YAF16) {
         ret  = encode_plane(f, sc, p->data[0] +           ps*x + y*p->linesize[0], width, height, p->linesize[0], 0, 0, 2, ac);
         ret |= encode_plane(f, sc, p->data[0] + (ps>>1) + ps*x + y*p->linesize[0], width, height, p->linesize[0], 1, 1, 2, ac);
+    } else if (f->bits_per_raw_sample == 32) {
+        ret = encode_float32_rgb_frame(f, sc, planes, width, height, p->linesize, ac);
     } else if (f->use32bit) {
         ret = encode_rgb_frame32(f, sc, planes, width, height, p->linesize, ac);
     } else {
@@ -1388,6 +1585,14 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         return 0;
     }
 
+    if (f->bits_per_raw_sample == 32 && f->remap_mode) {
+        //To reduce the needed memory we use the input frame (which is generally freely available)
+        //TODO use fltmap instead of the frame so as not to have to make it writable
+        int ret = av_frame_make_writable(pict);
+        if (ret < 0)
+            return ret;
+    }
+
     /* Maximum packet size */
     maxsize = ff_ffv1_encode_buffer_size(avctx);
 
@@ -1567,7 +1772,7 @@ const FFCodec ff_ffv1_encoder = {
         AV_PIX_FMT_YUV440P10, AV_PIX_FMT_YUV440P12,
         AV_PIX_FMT_YAF16,
         AV_PIX_FMT_GRAYF16,
-        AV_PIX_FMT_GBRPF16),
+        AV_PIX_FMT_GBRPF16, AV_PIX_FMT_GBRPF32),
     .color_ranges   = AVCOL_RANGE_MPEG,
     .p.priv_class   = &ffv1_class,
     .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP | FF_CODEC_CAP_EOF_FLUSH,
-- 
2.48.1