[FFmpeg-devel] [PATCH] avcodec/ffv1: WIP Single precision float support
Michael Niedermayer
michael at niedermayer.cc
Wed Mar 19 02:34:53 EET 2025
No, this does not work yet; I am just posting it for anyone curious.
Sponsored-by: Sovereign Tech Fund
Signed-off-by: Michael Niedermayer <michael at niedermayer.cc>
---
libavcodec/ffv1.h | 5 +-
libavcodec/ffv1_parse.c | 10 ++
libavcodec/ffv1dec.c | 36 ++++--
libavcodec/ffv1dec_template.c | 30 +++--
libavcodec/ffv1enc.c | 211 +++++++++++++++++++++++++++++++++-
5 files changed, 272 insertions(+), 20 deletions(-)
diff --git a/libavcodec/ffv1.h b/libavcodec/ffv1.h
index dd8a236efad..4db9a303bf1 100644
--- a/libavcodec/ffv1.h
+++ b/libavcodec/ffv1.h
@@ -109,7 +109,10 @@ typedef struct FFV1SliceContext {
uint64_t (*rc_stat2[MAX_QUANT_TABLES])[32][2];
};
};
- uint16_t fltmap[4][65536];
+ union {
+ uint16_t fltmap [4][65536]; //halffloat encode & decode
+ uint32_t fltmap32[4][65536]; //float decode
+ };
} FFV1SliceContext;
typedef struct FFV1Context {
diff --git a/libavcodec/ffv1_parse.c b/libavcodec/ffv1_parse.c
index 9745f9de694..10f3652ff51 100644
--- a/libavcodec/ffv1_parse.c
+++ b/libavcodec/ffv1_parse.c
@@ -419,6 +419,16 @@ int ff_ffv1_parse_header(FFV1Context *f, RangeCoder *c, uint8_t *state)
} else
f->pix_fmt = AV_PIX_FMT_GBRAP16;
f->use32bit = 1;
+ } else if (f->avctx->bits_per_raw_sample == 32 && !f->transparency) {
+ if (f->flt) {
+ f->pix_fmt = AV_PIX_FMT_GBRPF32;
+ }
+ f->use32bit = 1;
+ } else if (f->avctx->bits_per_raw_sample == 32 && f->transparency) {
+ if (f->flt) {
+ f->pix_fmt = AV_PIX_FMT_GBRAPF32;
+ }
+ f->use32bit = 1;
}
} else {
av_log(f->avctx, AV_LOG_ERROR, "colorspace not supported\n");
diff --git a/libavcodec/ffv1dec.c b/libavcodec/ffv1dec.c
index 75fb5ae2f69..d45aabbbde8 100644
--- a/libavcodec/ffv1dec.c
+++ b/libavcodec/ffv1dec.c
@@ -250,6 +250,16 @@ static int decode_slice_header(const FFV1Context *f,
sc->rawlsb = ff_ffv1_get_symbol(c, state, 0);
}
}
+ if (f->avctx->bits_per_raw_sample == 32) {
+ if (!sc->remap) {
+ av_log(f->avctx, AV_LOG_ERROR, "unsupported remap\n");
+ return AVERROR_INVALIDDATA;
+ }
+ if (sc->slice_width * sc->slice_height > 65536) {
+ av_log(f->avctx, AV_LOG_ERROR, "32bit needs remap\n");
+ return AVERROR_INVALIDDATA;
+ }
+ }
return 0;
}
@@ -266,28 +276,38 @@ static void slice_set_damaged(FFV1Context *f, FFV1SliceContext *sc)
static int decode_remap(FFV1Context *f, FFV1SliceContext *sc)
{
- int flip = sc->remap == 2 ? 0x7FFF : 0;
+ unsigned int end = f->avctx->bits_per_raw_sample == 32 ? 0xFFFFFFFF : 0xFFFF;
+ int flip = sc->remap == 2 ? (end>>1) : 0;
+ int sign = (end>>1)+1;
for (int p= 0; p < 1 + 2*f->chroma_planes + f->transparency; p++) {
int j = 0;
int lu = 0;
uint8_t state[2][32];
+ int64_t i;
memset(state, 128, sizeof(state));
-
- for (int i= 0; i<65536; i++) {
- int run = get_symbol_inline(&sc->c, state[lu], 0);
- if (run > 65536U - i)
+ for (i=0; i <= end ; i++) {
+ unsigned run = get_symbol_inline(&sc->c, state[lu], 0);
+ if (run > end - i + 1)
return AVERROR_INVALIDDATA;
if (lu) {
lu ^= !run;
while (run--) {
- sc->fltmap[p][j++] = i ^ ((i&0x8000) ? 0 : flip);
+ if (end == 0xFFFF) {
+ sc->fltmap [p][j++] = i ^ ((i& 0x8000) ? 0 : flip);
+ } else
+ sc->fltmap32[p][j++] = i ^ ((i&0x80000000) ? 0 : flip);
i++;
}
} else {
i += run;
- if (i != 65536)
- sc->fltmap[p][j++] = i ^ ((i&0x8000) ? 0 : flip);
+ if (i <= end) {
+ if (end == 0xFFFF) {
+ sc->fltmap [p][j++] = i ^ ((i& 0x8000) ? 0 : flip);
+ } else {
+ sc->fltmap32[p][j++] = i ^ ((i&0x80000000) ? 0 : flip);
+ }
+ }
lu ^= !run;
}
}
diff --git a/libavcodec/ffv1dec_template.c b/libavcodec/ffv1dec_template.c
index f9499931b1d..4e500ab5212 100644
--- a/libavcodec/ffv1dec_template.c
+++ b/libavcodec/ffv1dec_template.c
@@ -150,7 +150,7 @@ static int RENAME(decode_rgb_frame)(FFV1Context *f, FFV1SliceContext *sc,
int x, y, p;
TYPE *sample[4][2];
int lbd = f->avctx->bits_per_raw_sample <= 8;
- int bits = f->avctx->bits_per_raw_sample > 0 ? f->avctx->bits_per_raw_sample : 8;
+ int bits = f->avctx->bits_per_raw_sample > 0 ? FFMIN(f->avctx->bits_per_raw_sample, 16) : 8;
int offset = 1 << bits;
int transparency = f->transparency;
int ac = f->ac;
@@ -198,16 +198,30 @@ static int RENAME(decode_rgb_frame)(FFV1Context *f, FFV1SliceContext *sc,
r += g;
}
if (sc->remap) {
- r = sc->fltmap[0][r & 0xFFFF];
- g = sc->fltmap[1][g & 0xFFFF];
- b = sc->fltmap[2][b & 0xFFFF];
- if (transparency)
- a = sc->fltmap[3][a & 0xFFFF];
+ if (f->avctx->bits_per_raw_sample == 32) {
+ r = sc->fltmap32[0][r & 0xFFFF];
+ g = sc->fltmap32[1][g & 0xFFFF];
+ b = sc->fltmap32[2][b & 0xFFFF];
+ if (transparency)
+ a = sc->fltmap32[3][a & 0xFFFF];
+ } else {
+ r = sc->fltmap[0][r & 0xFFFF];
+ g = sc->fltmap[1][g & 0xFFFF];
+ b = sc->fltmap[2][b & 0xFFFF];
+ if (transparency)
+ a = sc->fltmap[3][a & 0xFFFF];
+ }
}
- if (lbd)
+ if (lbd) {
*((uint32_t*)(src[0] + x*4 + stride[0]*y)) = b + ((unsigned)g<<8) + ((unsigned)r<<16) + ((unsigned)a<<24);
- else if (sizeof(TYPE) == 4 || transparency) {
+ } else if (f->avctx->bits_per_raw_sample == 32) {
+ *((uint32_t*)(src[0] + x*4 + stride[0]*y)) = g;
+ *((uint32_t*)(src[1] + x*4 + stride[1]*y)) = b;
+ *((uint32_t*)(src[2] + x*4 + stride[2]*y)) = r;
+ if (transparency)
+ *((uint32_t*)(src[3] + x*4 + stride[3]*y)) = a;
+ } else if (sizeof(TYPE) == 4 || transparency) {
*((uint16_t*)(src[0] + x*2 + stride[0]*y)) = g;
*((uint16_t*)(src[1] + x*2 + stride[1]*y)) = b;
*((uint16_t*)(src[2] + x*2 + stride[2]*y)) = r;
diff --git a/libavcodec/ffv1enc.c b/libavcodec/ffv1enc.c
index 64add25b407..58e1227bd6c 100644
--- a/libavcodec/ffv1enc.c
+++ b/libavcodec/ffv1enc.c
@@ -31,6 +31,7 @@
#include "libavutil/mem.h"
#include "libavutil/opt.h"
#include "libavutil/pixdesc.h"
+#include "libavutil/qsort.h"
#include "avcodec.h"
#include "encode.h"
@@ -576,6 +577,9 @@ int ff_ffv1_encode_determine_slices(AVCodecContext *avctx)
continue;
if (maxw * maxh * (int64_t)(s->bits_per_raw_sample+1) * plane_count > 8<<24)
continue;
+ if (s->bits_per_raw_sample == 32)
+ if (maxw * maxh > 65536)
+ continue;
if (s->version < 4)
if ( ff_need_new_slices(avctx->width , s->num_h_slices, s->chroma_h_shift)
||ff_need_new_slices(avctx->height, s->num_v_slices, s->chroma_v_shift))
@@ -920,6 +924,10 @@ av_cold int ff_ffv1_encode_setup_plane_info(AVCodecContext *avctx,
case AV_PIX_FMT_GBRAPF16:
if (!avctx->bits_per_raw_sample && !s->bits_per_raw_sample)
s->bits_per_raw_sample = 16;
+ case AV_PIX_FMT_GBRPF32:
+ case AV_PIX_FMT_GBRAPF32:
+ if (!avctx->bits_per_raw_sample && !s->bits_per_raw_sample)
+ s->bits_per_raw_sample = 32;
else if (!s->bits_per_raw_sample)
s->bits_per_raw_sample = avctx->bits_per_raw_sample;
s->transparency = !!(desc->flags & AV_PIX_FMT_FLAG_ALPHA);
@@ -942,6 +950,10 @@ av_cold int ff_ffv1_encode_setup_plane_info(AVCodecContext *avctx,
if (s->remap_mode < 0)
s->remap_mode = s->flt ? 2 : 0;
+ if (s->remap_mode == 0 && s->bits_per_raw_sample == 32) {
+ av_log(avctx, AV_LOG_ERROR, "32bit requires remap\n");
+ return AVERROR(EINVAL);
+ }
return av_pix_fmt_get_chroma_sub_sample(pix_fmt, &s->chroma_h_shift, &s->chroma_v_shift);
}
@@ -1158,7 +1170,7 @@ static void choose_rct_params(const FFV1Context *f, FFV1SliceContext *sc,
sc->slice_rct_ry_coef = rct_y_coeff[best][0];
}
-static void encode_remap(FFV1Context *f, FFV1SliceContext *sc)
+static void encode_histogram_remap(FFV1Context *f, FFV1SliceContext *sc)
{
int flip = sc->remap == 2 ? 0x7FFF : 0;
@@ -1188,6 +1200,180 @@ static void encode_remap(FFV1Context *f, FFV1SliceContext *sc)
}
}
+typedef struct Unit {
+ uint32_t val; //this is unneeded if you accept a dereference on each access
+ int ndx; //unsigned 16 bit would suffice but make code more complex/slow
+} Unit;
+
+static void load_rgb_float32_frame(FFV1Context *f, FFV1SliceContext *sc,
+ const uint8_t *src[4],
+ int w, int h, const int stride[4],
+ Unit unit[4][65536])
+{
+ int x, y;
+ int transparency = f->transparency;
+ int i = 0;
+
+ for (y = 0; y < h; y++) {
+ for (x = 0; x < w; x++) {
+ int b, g, r, av_uninit(a);
+ int gi = x*4 + stride[0]*y;
+ int bi = x*4 + stride[1]*y;
+ int ri = x*4 + stride[2]*y;
+
+ g = *((const uint32_t *)(src[0] + gi));
+ b = *((const uint32_t *)(src[1] + bi));
+ r = *((const uint32_t *)(src[2] + ri));
+ if (transparency)
+ a = *((const uint32_t *)(src[3] + x*4 + stride[3]*y));
+
+ // We cannot build a histogram as we do for 16bit; instead we collect (value, offset) pairs and sort them
+ // It is possible to reduce the memory needed at the cost of more dereferencing
+ unit[0][i].val = r;
+ unit[0][i].ndx = ri;
+
+ unit[1][i].val = g;
+ unit[1][i].ndx = gi;
+
+ unit[2][i].val = b;
+ unit[2][i].ndx = bi;
+
+ if (transparency) {
+ unit[3][i].val = a;
+ unit[3][i].ndx = x*4 + stride[3]*y;
+ }
+ i++;
+ }
+ }
+
+ //TODO switch to radix sort
+#define CMP(A,B) ((A)->val - (int64_t)(B)->val)
+ AV_QSORT(unit[0], i, Unit, CMP);
+ AV_QSORT(unit[1], i, Unit, CMP);
+ AV_QSORT(unit[2], i, Unit, CMP);
+ if (transparency)
+ AV_QSORT(unit[3], i, Unit, CMP);
+}
+
+static void encode_float32_remap(FFV1Context *f, FFV1SliceContext *sc,
+ uint8_t *src[4], Unit unit[4][65536])
+{
+ int flip = sc->remap == 2 ? 0x7FFF : 0;
+ int pixel_num = sc->slice_width * sc->slice_height;
+
+ av_assert0 (pixel_num <= 65536);
+
+ for (int p= 0; p < 1 + 2*f->chroma_planes + f->transparency; p++) {
+ int lu = 0;
+ uint8_t state[2][32];
+ int run = 0;
+ int64_t last_val = -1;
+ int compact_index = 0;
+
+ memset(state, 128, sizeof(state));
+ for (int i= 0; i<pixel_num+1; i++) {
+ int64_t val;
+ if (i == pixel_num) {
+ if (last_val == 0xFFFFFFFF) {
+ break; //i think
+ } else {
+ val = 1LL<<32;
+ }
+ } else
+ val = unit[p][i].val;
+// if (flip) TODO
+// val ^= (XX&0x80000000) ? 0 : 0x7FFFFFFF;
+
+ *((uint32_t *)(src[p] + unit[p][i].ndx)) = compact_index;
+
+ if (last_val != val) {
+ av_assert2(last_val < val);
+ if (lu) {
+ if (val - last_val == 1) {
+ run ++;
+ last_val = val;
+ } else {
+ av_log(0,0, "R%d %6d\n", lu, (int)(run));
+ put_symbol_inline(&sc->c, state[lu], run, 0, NULL, NULL);
+ if (run == 0)
+ lu ^= 1;
+ run = 0;
+ i--; // we did not encode val so we need to backstep
+ last_val ++;
+ continue;
+ }
+ } else {
+ av_assert2(run == 0);
+ av_log(0,0, "R%d %6d\n", lu, (int)(val - last_val - 1));
+ put_symbol_inline(&sc->c, state[lu], val - last_val - 1, 0, NULL, NULL);
+ if (val - last_val == 1)
+ lu ^= 1;
+ last_val = val;
+ }
+ compact_index ++;
+ }
+ }
+ }
+}
+
+//TODO once this is working, consider factoring this together with the 16bit integer version and see whether the result is too messy or better
+static int encode_float32_rgb_frame(FFV1Context *f, FFV1SliceContext *sc,
+ const uint8_t *src[4],
+ int w, int h, const int stride[4], int ac)
+{
+ int x, y, p, i;
+ const int ring_size = f->context_model ? 3 : 2;
+ int32_t *sample[4][3];
+ const int pass1 = !!(f->avctx->flags & AV_CODEC_FLAG_PASS1);
+ int bits = 16; //TODO explain this in the specifciation, we have 32bits in but really encode max 16
+ int offset = 1 << bits;
+ int transparency = f->transparency;
+
+ sc->run_index = 0;
+
+ memset(RENAME(sc->sample_buffer), 0, ring_size * MAX_PLANES *
+ (w + 6) * sizeof(*RENAME(sc->sample_buffer)));
+
+ for (y = 0; y < h; y++) {
+ for (i = 0; i < ring_size; i++)
+ for (p = 0; p < MAX_PLANES; p++)
+ sample[p][i]= RENAME(sc->sample_buffer) + p*ring_size*(w+6) + ((h+i-y)%ring_size)*(w+6) + 3;
+
+ for (x = 0; x < w; x++) {
+ int b, g, r, av_uninit(a);
+ g = *((const uint32_t *)(src[0] + x*4 + stride[0]*y));
+ b = *((const uint32_t *)(src[1] + x*4 + stride[1]*y));
+ r = *((const uint32_t *)(src[2] + x*4 + stride[2]*y));
+ if (transparency)
+ a = *((const uint32_t *)(src[3] + x*4 + stride[3]*y));
+
+ if (sc->slice_coding_mode != 1) {
+ b -= g;
+ r -= g;
+ g += (b * sc->slice_rct_by_coef + r * sc->slice_rct_ry_coef) >> 2;
+ b += offset;
+ r += offset;
+ }
+
+ sample[0][0][x] = g;
+ sample[1][0][x] = b;
+ sample[2][0][x] = r;
+ sample[3][0][x] = a;
+ }
+ for (p = 0; p < 3 + transparency; p++) {
+ int ret;
+ sample[p][0][-1] = sample[p][1][0 ];
+ sample[p][1][ w] = sample[p][1][w-1];
+ ret = encode_line32(f, sc, f->avctx, w, sample[p], (p + 1) / 2,
+ bits + (sc->slice_coding_mode != 1), ac, pass1);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return 0;
+}
+
+
static int encode_slice(AVCodecContext *c, void *arg)
{
FFV1SliceContext *sc = arg;
@@ -1226,6 +1412,10 @@ retry:
}
if (sc->remap) {
+ //Both the 16bit and 32bit remap do exactly the same thing, but with 16 bits we can
+ //implement this using a "histogram", while for 32bit that would be GB-sized; thus a more
+ //complex implementation that sorts pairs is used.
+ if (f->bits_per_raw_sample != 32) {
if (f->colorspace == 0 && c->pix_fmt != AV_PIX_FMT_YA8 && c->pix_fmt != AV_PIX_FMT_YAF16) {
const int cx = x >> f->chroma_h_shift;
const int cy = y >> f->chroma_v_shift;
@@ -1249,7 +1439,12 @@ retry:
} else
load_rgb_frame (f, sc, planes, width, height, p->linesize);
- encode_remap(f, sc);
+ encode_histogram_remap(f, sc);
+ } else {
+ Unit pairs[4][65536];
+ load_rgb_float32_frame(f, sc, planes, width, height, p->linesize, pairs);
+ encode_float32_remap(f, sc, planes, pairs);
+ }
}
if (ac == AC_GOLOMB_RICE) {
@@ -1281,6 +1476,8 @@ retry:
} else if (c->pix_fmt == AV_PIX_FMT_YA8 || c->pix_fmt == AV_PIX_FMT_YAF16) {
ret = encode_plane(f, sc, p->data[0] + ps*x + y*p->linesize[0], width, height, p->linesize[0], 0, 0, 2, ac);
ret |= encode_plane(f, sc, p->data[0] + (ps>>1) + ps*x + y*p->linesize[0], width, height, p->linesize[0], 1, 1, 2, ac);
+ } else if (f->bits_per_raw_sample == 32) {
+ ret = encode_float32_rgb_frame(f, sc, planes, width, height, p->linesize, ac);
} else if (f->use32bit) {
ret = encode_rgb_frame32(f, sc, planes, width, height, p->linesize, ac);
} else {
@@ -1388,6 +1585,14 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
return 0;
}
+ if (f->bits_per_raw_sample == 32 && f->remap_mode) {
+ //To reduce the needed memory, we reuse the input frame as storage (it is generally freely available)
+ //TODO use fltmap instead of the frame so that it does not have to be made writable
+ int ret = av_frame_make_writable(pict);
+ if (ret < 0)
+ return ret;
+ }
+
/* Maximum packet size */
maxsize = ff_ffv1_encode_buffer_size(avctx);
@@ -1567,7 +1772,7 @@ const FFCodec ff_ffv1_encoder = {
AV_PIX_FMT_YUV440P10, AV_PIX_FMT_YUV440P12,
AV_PIX_FMT_YAF16,
AV_PIX_FMT_GRAYF16,
- AV_PIX_FMT_GBRPF16),
+ AV_PIX_FMT_GBRPF16, AV_PIX_FMT_GBRPF32),
.color_ranges = AVCOL_RANGE_MPEG,
.p.priv_class = &ffv1_class,
.caps_internal = FF_CODEC_CAP_INIT_CLEANUP | FF_CODEC_CAP_EOF_FLUSH,
--
2.48.1
More information about the ffmpeg-devel
mailing list